[PATCH] Rewrite T3/T4 {add,sub}_n.asm
David Miller
davem at davemloft.net
Thu Apr 4 23:29:18 CEST 2013
This meets all of the theoretical performance goals we mentioned
the other day.
T3 seems to be much more sensitive to loop alignment than T4 is. For
example, if I take out the ALIGN(16) and the annulling branch from
add_n.asm, T3 takes an extra cycle per loop iteration, i.e. per two limbs,
which pushes it from 8 c/l to 8.5 c/l.
I might want to come back to this and see whether we can instead align the
whole function, so that the loop either ends up perfectly aligned with no
further changes or needs only one or two nops to get there. Either way that
would be cheaper than the unconditional annulled branch; a rough sketch of
the idea is below.
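For concreteness, the alternative would look something like this in
add_n.asm (the nop count is only a placeholder; the real number depends on
how many instructions end up between the aligned function entry and the
loop):

	C current shape: an annulled branch hops over the ALIGN padding
		b,a	L(top)
		ALIGN(16)
	L(top):
		...

	C sketched alternative: align the function entry itself, then pad
	C up to the loop with plain nops as needed (count is illustrative)
		nop
		nop
	L(top):
		...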
Validated with "make check" and "try".
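The "try" runs use the harness in tests/devel; from memory the invocation
is roughly the following, though the exact target and argument names may
differ:

	cd tests/devel
	make try
	./try mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc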
2013-04-04 David S. Miller <davem at davemloft.net>
* mpn/sparc64/ultrasparct3/add_n.asm: Rewrite.
* mpn/sparc64/ultrasparct3/sub_n.asm: Rewrite.
diff -r 01170daebe7d mpn/sparc64/ultrasparct3/add_n.asm
--- a/mpn/sparc64/ultrasparct3/add_n.asm Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/add_n.asm Thu Apr 04 14:17:00 2013 -0700
@@ -22,46 +22,86 @@
include(`../config.m4')
C cycles/limb
-C UltraSPARC T3: 9
-C UltraSPARC T4: 3.5
+C UltraSPARC T3: 8
+C UltraSPARC T4: 3
C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n', `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l2')
+define(`u1_off', `%l3')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_add_nc)
+ save %sp, -176, %sp
b,a L(ent)
EPILOGUE()
PROLOGUE(mpn_add_n)
+ save %sp, -176, %sp
+
mov 0, cy
L(ent):
subcc n, 1, n
be L(final_one)
cmp %g0, cy
+
+ ldx [up + 0], %o4
+ sllx n, 3, tmp
+
+ ldx [vp + 0], %o5
+ add up, tmp, u0_off
+
+ ldx [up + 8], %g2
+ neg tmp, loop_n
+
+ ldx [vp + 8], %g1
+ add u0_off, 8, u1_off
+
+ sub loop_n, -(2 * 8), loop_n
+
+ brgez,pn loop_n, L(loop_tail)
+ add vp, (2 * 8), vp
+
+ b,a L(top)
+ ALIGN(16)
L(top):
- ldx [up+0], %o4
- add up, 16, up
- ldx [vp+0], %o5
- add vp, 16, vp
- ldx [up-8], %g1
- add rp, 16, rp
- ldx [vp-8], %g2
- sub n, 2, n
+ addxccc(%o4, %o5, tmp)
+ ldx [vp + 0], %o5
+
+ add rp, (2 * 8), rp
+ ldx [loop_n + u0_off], %o4
+
+ add vp, (2 * 8), vp
+ stx tmp, [rp - 16]
+
+ addxccc(%g1, %g2, tmp)
+ ldx [vp - 8], %g1
+
+ ldx [loop_n + u1_off], %g2
+ sub loop_n, -(2 * 8), loop_n
+
+ brlz loop_n, L(top)
+ stx tmp, [rp - 8]
+
+L(loop_tail):
addxccc(%o4, %o5, %g3)
- stx %g3, [rp-16]
+ add loop_n, u0_off, up
+
addxccc(%g1, %g2, %g2)
- brgz n, L(top)
- stx %g2, [rp-8]
+ stx %g3, [rp + 0]
- brlz,pt n, L(done)
- nop
+ brgz,pt loop_n, L(done)
+ stx %g2, [rp + 8]
+ add rp, (2 * 8), rp
L(final_one):
ldx [up+0], %o4
ldx [vp+0], %o5
@@ -69,6 +109,7 @@
stx %g3, [rp+0]
L(done):
- retl
- addxc( %g0, %g0, %o0)
+ addxc(%g0, %g0, %i0)
+ ret
+ restore
EPILOGUE()
diff -r 01170daebe7d mpn/sparc64/ultrasparct3/invert_limb.asm
--- a/mpn/sparc64/ultrasparct3/invert_limb.asm Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/invert_limb.asm Thu Apr 04 14:17:00 2013 -0700
@@ -75,6 +75,7 @@
RODATA
TYPE( approx_tab, object)
SIZE( approx_tab, 512)
+ ALIGN(2)
approx_tab:
.half 2045,2037,2029,2021,2013,2005,1998,1990
.half 1983,1975,1968,1960,1953,1946,1938,1931
diff -r 01170daebe7d mpn/sparc64/ultrasparct3/sub_n.asm
--- a/mpn/sparc64/ultrasparct3/sub_n.asm Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/sub_n.asm Thu Apr 04 14:17:00 2013 -0700
@@ -22,48 +22,101 @@
include(`../config.m4')
C cycles/limb
-C UltraSPARC T3: 10
-C UltraSPARC T4: 4
+C UltraSPARC T3: 8
+C UltraSPARC T4: 3
C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n', `%o3')
-define(`cy', `%o4')
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n', `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l0')
+define(`u1_off', `%l1')
+define(`v0_off', `%l2')
+define(`v1_off', `%l3')
+define(`r0_off', `%l4')
+define(`r1_off', `%l5')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
ASM_START()
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)
PROLOGUE(mpn_sub_nc)
+ save %sp, -176, %sp
ba,pt %xcc, L(ent)
xor cy, 1, cy
EPILOGUE()
PROLOGUE(mpn_sub_n)
+ save %sp, -176, %sp
mov 1, cy
L(ent):
subcc n, 1, n
be L(final_one)
cmp %g0, cy
+
+ ldx [up + 0], %o4
+ sllx n, 3, tmp
+
+ ldx [vp + 0], %o5
+ add up, tmp, u0_off
+
+ ldx [up + 8], %g2
+ add vp, tmp, v0_off
+
+ ldx [vp + 8], %g1
+ add rp, tmp, r0_off
+
+ neg tmp, loop_n
+ add u0_off, 8, u1_off
+
+ add v0_off, 8, v1_off
+ sub loop_n, -(2 * 8), loop_n
+
+ sub r0_off, 16, r0_off
+ brgez,pn loop_n, L(loop_tail)
+ sub r0_off, 8, r1_off
+
+ b,a L(top)
+ ALIGN(16)
L(top):
- ldx [vp+0], %o5
- add vp, 16, vp
- ldx [up+0], %o4
- add up, 16, up
- ldx [vp-8], %g2
- sub n, 2, n
- ldx [up-8], %g1
- add rp, 16, rp
- xnor %o5, %g0, %o5
- addxccc(%o4, %o5, %g3)
- stx %g3, [rp-16]
- xnor %g2, %g0, %g2
- addxccc(%g1, %g2, %g2)
- brgz n, L(top)
- stx %g2, [rp-8]
+ xnor %o5, 0, tmp
+ ldx [loop_n + v0_off], %o5
- brlz,pt n, L(done)
- nop
+ addxccc(%o4, tmp, %g3)
+ ldx [loop_n + u0_off], %o4
+
+ xnor %g1, 0, %g1
+ stx %g3, [loop_n + r0_off]
+
+ addxccc(%g2, %g1, tmp)
+ ldx [loop_n + v1_off], %g1
+
+ ldx [loop_n + u1_off], %g2
+ sub loop_n, -(2 * 8), loop_n
+
+ brlz loop_n, L(top)
+ stx tmp, [loop_n + r1_off]
+
+L(loop_tail):
+ xnor %o5, 0, tmp
+ xnor %g1, 0, %g1
+
+ addxccc(%o4, tmp, %g3)
+ add loop_n, u0_off, up
+
+ addxccc(%g2, %g1, %g2)
+ add loop_n, r0_off, rp
+
+ stx %g3, [rp + 0]
+ add loop_n, v0_off, vp
+
+ brgz,pt loop_n, L(done)
+ stx %g2, [rp + 8]
+
+ add rp, (2 * 8), rp
L(final_one):
ldx [up+0], %o4
@@ -73,7 +126,8 @@
stx %g3, [rp+0]
L(done):
- clr %o0
- retl
- movcc %xcc, 1, %o0
+ clr %i0
+ movcc %xcc, 1, %i0
+ ret
+ restore
EPILOGUE()