[PATCH] Rewrite T3/T4 {add,sub}_n.asm

David Miller davem at davemloft.net
Thu Apr 4 23:29:18 CEST 2013


This meets all of the theoretical performance goals we mentioned
the other day.

T3 seems to be much more sensitive to loop alignment than T4 is.
For example, if I take out the ALIGN(16) and the annulling branch
from add_n.asm, T3 takes an extra cycle per loop iteration, i.e.
8.5 c/l instead of 8, since each iteration handles two limbs.
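
For reference, here is the alignment trick as it stands in the new
add_n.asm (quoted from the patch below):

	b,a	L(top)
	ALIGN(16)
L(top):
	...

ALIGN(16) pads the text up to the next 16-byte boundary, and the
annulled unconditional branch skips over that padding without
executing any of it, so the loop starts aligned at the cost of one
taken branch on entry.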

I might want to come back to this and see if we can instead align
the whole function, so that the loop either ends up perfectly aligned
as-is or needs at most one or two nops in front of it.  Either way
that would be cheaper than the unconditional annulled branch.
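
Roughly what I have in mind, assuming PROLOGUE() doesn't already force
its own alignment (the nop count below is made up; it would have to be
tuned to wherever L(top) actually lands):

	ALIGN(16)
PROLOGUE(mpn_add_n)
	save	%sp, -176, %sp
	...
	nop			C zero, one or two of these, enough to
	nop			C land L(top) on a 16-byte boundary
L(top):
	...

That trades the taken branch on every call for at most a couple of
straight-line nops.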

Validated with "make check" and the "try" test program.

2013-04-04  David S. Miller  <davem at davemloft.net>

	* mpn/sparc64/ultrasparct3/add_n.asm: Rewrite.
	* mpn/sparc64/ultrasparct3/invert_limb.asm: Align approx_tab.
	* mpn/sparc64/ultrasparct3/sub_n.asm: Rewrite.

diff -r 01170daebe7d mpn/sparc64/ultrasparct3/add_n.asm
--- a/mpn/sparc64/ultrasparct3/add_n.asm	Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/add_n.asm	Thu Apr 04 14:17:00 2013 -0700
@@ -22,46 +22,86 @@
 include(`../config.m4')
 
 C		   cycles/limb
-C UltraSPARC T3:	 9
-C UltraSPARC T4:	 3.5
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
 
 C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n',  `%o3')
-define(`cy', `%o4')
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l2')
+define(`u1_off', `%l3')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
 
 ASM_START()
 	REGISTER(%g2,#scratch)
 	REGISTER(%g3,#scratch)
 PROLOGUE(mpn_add_nc)
+	save	%sp, -176, %sp
 	b,a	L(ent)
 EPILOGUE()
 PROLOGUE(mpn_add_n)
+	save	%sp, -176, %sp
+
 	mov	0, cy
 L(ent):
 	subcc	n, 1, n
 	be	L(final_one)
 	 cmp	%g0, cy
+
+	ldx	[up + 0], %o4
+	sllx	n, 3, tmp
+
+	ldx	[vp + 0], %o5
+	add	up, tmp, u0_off
+
+	ldx	[up + 8], %g2
+	neg	tmp, loop_n
+
+	ldx	[vp + 8], %g1
+	add	u0_off, 8, u1_off
+
+	sub	loop_n, -(2 * 8), loop_n
+
+	brgez,pn loop_n, L(loop_tail)
+	 add	vp, (2 * 8), vp
+
+	b,a	L(top)
+	ALIGN(16)
 L(top):
-	ldx	[up+0], %o4
-	add	up, 16, up
-	ldx	[vp+0], %o5
-	add	vp, 16, vp
-	ldx	[up-8], %g1
-	add	rp, 16, rp
-	ldx	[vp-8], %g2
-	sub	n, 2, n
+	addxccc(%o4, %o5, tmp)
+	ldx	[vp + 0], %o5
+
+	add	rp, (2 * 8), rp
+	ldx	[loop_n + u0_off], %o4
+
+	add	vp, (2 * 8), vp
+	stx	tmp, [rp - 16]
+
+	addxccc(%g1, %g2, tmp)
+	ldx	[vp - 8], %g1
+
+	ldx	[loop_n + u1_off], %g2
+	sub	loop_n, -(2 * 8), loop_n
+
+	brlz	loop_n, L(top)
+	 stx	tmp, [rp - 8]
+
+L(loop_tail):
 	addxccc(%o4, %o5, %g3)
-	stx	%g3, [rp-16]
+	add	loop_n, u0_off, up
+
 	addxccc(%g1, %g2, %g2)
-	brgz	n, L(top)
-	 stx	%g2, [rp-8]
+	stx	%g3, [rp + 0]
 
-	brlz,pt	n, L(done)
-	 nop
+	brgz,pt	loop_n, L(done)
+	 stx	%g2, [rp + 8]
 
+	add	rp, (2 * 8), rp
 L(final_one):
 	ldx	[up+0], %o4
 	ldx	[vp+0], %o5
@@ -69,6 +109,7 @@
 	stx	%g3, [rp+0]
 
 L(done):
-	retl
-	 addxc(	%g0, %g0, %o0)
+	addxc(%g0, %g0, %i0)
+	ret
+	 restore
 EPILOGUE()
diff -r 01170daebe7d mpn/sparc64/ultrasparct3/invert_limb.asm
--- a/mpn/sparc64/ultrasparct3/invert_limb.asm	Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/invert_limb.asm	Thu Apr 04 14:17:00 2013 -0700
@@ -75,6 +75,7 @@
 	RODATA
 	TYPE(	approx_tab, object)
 	SIZE(	approx_tab, 512)
+	ALIGN(2)
 approx_tab:
 	.half	2045,2037,2029,2021,2013,2005,1998,1990
 	.half	1983,1975,1968,1960,1953,1946,1938,1931
diff -r 01170daebe7d mpn/sparc64/ultrasparct3/sub_n.asm
--- a/mpn/sparc64/ultrasparct3/sub_n.asm	Thu Apr 04 03:20:03 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/sub_n.asm	Thu Apr 04 14:17:00 2013 -0700
@@ -22,48 +22,101 @@
 include(`../config.m4')
 
 C		   cycles/limb
-C UltraSPARC T3:	10
-C UltraSPARC T4:	 4
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
 
 C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`vp', `%o2')
-define(`n',  `%o3')
-define(`cy', `%o4')
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l0')
+define(`u1_off', `%l1')
+define(`v0_off', `%l2')
+define(`v1_off', `%l3')
+define(`r0_off', `%l4')
+define(`r1_off', `%l5')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
 
 ASM_START()
 	REGISTER(%g2,#scratch)
 	REGISTER(%g3,#scratch)
 PROLOGUE(mpn_sub_nc)
+	save	%sp, -176, %sp
 	ba,pt	%xcc, L(ent)
 	 xor	cy, 1, cy
 EPILOGUE()
 PROLOGUE(mpn_sub_n)
+	save	%sp, -176, %sp
 	mov	1, cy
 L(ent):
 	subcc	n, 1, n
 	be	L(final_one)
 	 cmp	%g0, cy
+
+	ldx	[up + 0], %o4
+	sllx	n, 3, tmp
+
+	ldx	[vp + 0], %o5
+	add	up, tmp, u0_off
+
+	ldx	[up + 8], %g2
+	add	vp, tmp, v0_off
+
+	ldx	[vp + 8], %g1
+	add	rp, tmp, r0_off
+
+	neg	tmp, loop_n
+	add	u0_off, 8, u1_off
+
+	add	v0_off, 8, v1_off
+	sub	loop_n, -(2 * 8), loop_n
+
+	sub	r0_off, 16, r0_off
+	brgez,pn loop_n, L(loop_tail)
+	 sub	r0_off, 8, r1_off
+
+	b,a	L(top)
+	ALIGN(16)
 L(top):
-	ldx	[vp+0], %o5
-	add	vp, 16, vp
-	ldx	[up+0], %o4
-	add	up, 16, up
-	ldx	[vp-8], %g2
-	sub	n, 2, n
-	ldx	[up-8], %g1
-	add	rp, 16, rp
-	xnor	%o5, %g0, %o5
-	addxccc(%o4, %o5, %g3)
-	stx	%g3, [rp-16]
-	xnor	%g2, %g0, %g2
-	addxccc(%g1, %g2, %g2)
-	brgz	n, L(top)
-	 stx	%g2, [rp-8]
+	xnor	%o5, 0, tmp
+	ldx	[loop_n + v0_off], %o5
 
-	brlz,pt	n, L(done)
-	 nop
+	addxccc(%o4, tmp, %g3)
+	ldx	[loop_n + u0_off], %o4
+
+	xnor	%g1, 0, %g1
+	stx	%g3, [loop_n + r0_off]
+
+	addxccc(%g2, %g1, tmp)
+	ldx	[loop_n + v1_off], %g1
+
+	ldx	[loop_n + u1_off], %g2
+	sub	loop_n, -(2 * 8), loop_n
+
+	brlz	loop_n, L(top)
+	 stx	tmp, [loop_n + r1_off]
+
+L(loop_tail):
+	xnor	%o5, 0, tmp
+	xnor	%g1, 0, %g1
+
+	addxccc(%o4, tmp, %g3)
+	add	loop_n, u0_off, up
+
+	addxccc(%g2, %g1, %g2)
+	add	loop_n, r0_off, rp
+
+	stx	%g3, [rp + 0]
+	add	loop_n, v0_off, vp
+
+	brgz,pt	loop_n, L(done)
+	 stx	%g2, [rp + 8]
+
+	add	rp, (2 * 8), rp
 
 L(final_one):
 	ldx	[up+0], %o4
@@ -73,7 +126,8 @@
 	stx	%g3, [rp+0]
 
 L(done):
-	clr	%o0
-	retl
-	 movcc	%xcc, 1, %o0
+	clr	%i0
+	movcc	%xcc, 1, %i0
+	ret
+	 restore
 EPILOGUE()

