Possible new T3-T5 mul_1

David Miller davem at davemloft.net
Tue Apr 2 07:20:25 CEST 2013


Does the current tree even build for you when targeting t3/t4?  None
of the umulxhi's in ultrasparct3/mul_1.asm et al. put their
parameters inside parentheses, and your compat macros seem to
require this.

So I get things like this:

tmp-add_n.s: Assembler messages:
tmp-add_n.s:86: Error: Illegal operands
tmp-add_n.s:88: Error: Illegal operands
tmp-add_n.s:98: Error: Illegal operands
tmp-add_n.s:103: Error: Illegal operands

I suspect you had tmp-foo.s files in your tree and just typed
make, and for some reason 'make' didn't see the dependency
properly and therefore didn't rebuild the tmp-foo.s files
from foo.asm.

Anyway, we need something like the patch below to get the tree
building again.

Also attached are the speed outputs for the existing T3 mul_1.asm
(before) and for your 4-way unrolled variant (after), which
appears to converge to 3 c/l (cycles per limb).
-------------- next part --------------
diff -r b696e5ab9461 mpn/sparc64/ultrasparct3/add_n.asm
--- a/mpn/sparc64/ultrasparct3/add_n.asm	Tue Apr 02 00:36:00 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/add_n.asm	Mon Apr 01 22:18:15 2013 -0700
@@ -53,9 +53,9 @@
 	add	rp, 16, rp
 	ldx	[vp-8], %g2
 	sub	n, 2, n
-	addxccc	%o4, %o5, %g3
+	addxccc(%o4, %o5, %g3)
 	stx	%g3, [rp-16]
-	addxccc	%g1, %g2, %g2
+	addxccc(%g1, %g2, %g2)
 	brgz	n, L(top)
 	 stx	%g2, [rp-8]
 
@@ -65,10 +65,10 @@
 L(final_one):
 	ldx	[up+0], %o4
 	ldx	[vp+0], %o5
-	addxccc	%o4, %o5, %g3
+	addxccc(%o4, %o5, %g3)
 	stx	%g3, [rp+0]
 
 L(done):
 	retl
-	 addxc	%g0, %g0, %o0
+	 addxc(%g0, %g0, %o0)
 EPILOGUE()
diff -r b696e5ab9461 mpn/sparc64/ultrasparct3/addmul_1.asm
--- a/mpn/sparc64/ultrasparct3/addmul_1.asm	Tue Apr 02 00:36:00 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/addmul_1.asm	Mon Apr 01 22:18:15 2013 -0700
@@ -47,17 +47,17 @@
 	ldx	[rp+8], %l3
 	mulx	%l0, v0, %o0
 	add	up, 16, up
-	umulxhi	%l0, v0, %o1
+	umulxhi(%l0, v0, %o1)
 	add	rp, 16, rp
 	mulx	%l1, v0, %o2
 	sub	n, 2, n
-	umulxhi	%l1, v0, %o3
-	addxccc	%o5, %o0, %o0
-	addxccc	%o1, %o2, %o2
-	addxc	%g0, %o3, %o5
+	umulxhi(%l1, v0, %o3)
+	addxccc(%o5, %o0, %o0)
+	addxccc(%o1, %o2, %o2)
+	addxc(%g0, %o3, %o5)
 	addcc	%l2, %o0, %o0
 	stx	%o0, [rp-16]
-	addxccc	%l3, %o2, %o2
+	addxccc(%l3, %o2, %o2)
 	brgz	n, L(top)
 	 stx	%o2, [rp-8]
 
@@ -68,14 +68,14 @@
 	ldx	[up+0], %l0
 	ldx	[rp+0], %l2
 	mulx	%l0, v0, %o0
-	umulxhi	%l0, v0, %o1
-	addxccc	%o5, %o0, %o0
-	addxc	%g0, %o1, %o5
+	umulxhi(%l0, v0, %o1)
+	addxccc(%o5, %o0, %o0)
+	addxc(%g0, %o1, %o5)
 	addcc	%l2, %o0, %o0
 	stx	%o0, [rp+0]
 
 L(done):
-	addxc	%g0, %o5, %i0
+	addxc(%g0, %o5, %i0)
 	ret
 	 restore
 EPILOGUE()
diff -r b696e5ab9461 mpn/sparc64/ultrasparct3/mul_1.asm
--- a/mpn/sparc64/ultrasparct3/mul_1.asm	Tue Apr 02 00:36:00 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/mul_1.asm	Mon Apr 01 22:18:15 2013 -0700
@@ -45,13 +45,13 @@
 	ldx	[up+8], %o4
 	mulx	%g1, v0, %g3
 	add	up, 16, up
-	umulxhi	%g1, v0, %g2
+	umulxhi(%g1, v0, %g2)
 	mulx	%o4, v0, %g1
 	add	rp, 16, rp
-	addxccc	%g3, %o5, %g3
-	umulxhi	%o4, v0, %o5
+	addxccc(%g3, %o5, %g3)
+	umulxhi(%o4, v0, %o5)
 	stx	%g3, [rp-16]
-	addxccc	%g1, %g2, %g1
+	addxccc(%g1, %g2, %g1)
 	brgz	n, L(top)
 	 stx	%g1, [rp-8]
 
@@ -61,11 +61,11 @@
 L(final_one):
 	ldx	[up+0], %g1
 	mulx	%g1, v0, %g3
-	addxccc	%g3, %o5, %g3
-	umulxhi	%g1, v0, %o5
+	addxccc(%g3, %o5, %g3)
+	umulxhi(%g1, v0, %o5)
 	stx	%g3, [rp+0]
 
 L(done):
 	retl
-	 addxc	%g0, %o5, %o0
+	 addxc(%g0, %o5, %o0)
 EPILOGUE()
diff -r b696e5ab9461 mpn/sparc64/ultrasparct3/sub_n.asm
--- a/mpn/sparc64/ultrasparct3/sub_n.asm	Tue Apr 02 00:36:00 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/sub_n.asm	Mon Apr 01 22:18:15 2013 -0700
@@ -55,10 +55,10 @@
 	ldx	[up-8], %g1
 	add	rp, 16, rp
 	xnor	%o5, %g0, %o5
-	addxccc	%o4, %o5, %g3
+	addxccc(%o4, %o5, %g3)
 	stx	%g3, [rp-16]
 	xnor	%g2, %g0, %g2
-	addxccc	%g1, %g2, %g2
+	addxccc(%g1, %g2, %g2)
 	brgz	n, L(top)
 	 stx	%g2, [rp-8]
 
@@ -69,7 +69,7 @@
 	ldx	[up+0], %o4
 	ldx	[vp+0], %o5
 	xnor	%o5, %g0, %o5
-	addxccc	%o4, %o5, %g3
+	addxccc(%o4, %o5, %g3)
 	stx	%g3, [rp+0]
 
 L(done):
diff -r b696e5ab9461 mpn/sparc64/ultrasparct3/submul_1.asm
--- a/mpn/sparc64/ultrasparct3/submul_1.asm	Tue Apr 02 00:36:00 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/submul_1.asm	Mon Apr 01 22:18:15 2013 -0700
@@ -47,17 +47,17 @@
 	ldx	[rp+8], %l3
 	mulx	%l0, v0, %o0
 	add	up, 16, up
-	umulxhi	%l0, v0, %o1
+	umulxhi(%l0, v0, %o1)
 	add	rp, 16, rp
 	mulx	%l1, v0, %o2
 	sub	n, 2, n
-	umulxhi	%l1, v0, %o3
-	addxccc	%o5, %o0, %o0
-	addxc	%g0, %o1, %o5
+	umulxhi(%l1, v0, %o3)
+	addxccc(%o5, %o0, %o0)
+	addxc(%g0, %o1, %o5)
 	subcc	%l2, %o0, %o0
 	stx	%o0, [rp-16]
-	addxccc	%o5, %o2, %o2
-	addxc	%g0, %o3, %o5
+	addxccc(%o5, %o2, %o2)
+	addxc(%g0, %o3, %o5)
 	subcc	%l3, %o2, %o2
 	brgz	n, L(top)
 	 stx	%o2, [rp-8]
@@ -69,14 +69,14 @@
 	ldx	[up+0], %l0
 	ldx	[rp+0], %l2
 	mulx	%l0, v0, %o0
-	umulxhi	%l0, v0, %o1
-	addxccc	%o5, %o0, %o0
-	addxc	%g0, %o1, %o5
+	umulxhi(%l0, v0, %o1)
+	addxccc(%o5, %o0, %o0)
+	addxc(%g0, %o1, %o5)
 	subcc	%l2, %o0, %o0
 	stx	%o0, [rp+0]
 
 L(done):
-	addxc	%g0, %o5, %i0
+	addxc(%g0, %o5, %i0)
 	ret
 	 restore
 EPILOGUE()
-------------- next part --------------
overhead 6.00 cycles, precision 10000000 units of 3.51e-10 secs, CPU freq 2847.61 MHz
          mpn_mul_1.3
1             11.0001
2              7.2501
3              5.6667
4              5.5001
5              4.9143
6              4.8334
7              4.6072
8              4.5625
9              4.4445
10             4.3125
11             4.2425
12             4.2125
13             4.1347
14             4.1191
15             4.1011
16             4.0938
17             4.0589
18             4.0252
19             4.0856
20             3.9500
22             3.9319
24             3.9167
26             3.8847
28             3.8310
30             3.8334
33             3.8447
36             3.8056
39             3.8160
42             3.7619
46             3.7479
50             3.7300
55             3.7286
60             3.7084
66             4.8334
72             4.1806
79             4.1899
86             4.0926
94             4.0639
103            4.0583
113            4.0266
124            3.9390
136            3.9062
149            3.9329
163            3.9080
179            3.8883
196            3.8062
215            3.8419
236            3.7670
259            3.8108
284            3.7324
312            3.7180
343            3.7697
377            3.7587
414            3.6800
455            3.7385
500            3.6640
550            3.6546
605            3.7141
665            3.7083
731            3.7018
804            3.6281
884            3.6188
972            3.6114
-------------- next part --------------
overhead 6.00 cycles, precision 10000000 units of 3.51e-10 secs, CPU freq 2847.55 MHz
          mpn_mul_1.3
1             12.0001
2              9.0001
3              6.7778
4              5.2501
5              5.2001
6              5.3334
7              4.9286
8              4.3750
9              4.2223
10             4.4001
11             4.3518
12             3.9034
13             3.9231
14             4.0000
15             3.9778
16             3.7001
17             3.7147
18             3.8334
19             3.8421
20             3.6000
22             3.6819
24             3.4792
26             3.5846
28             3.4286
30             3.5000
33             3.3864
36             3.3334
39             3.4103
42             3.3572
46             3.3261
50             3.3000
55             3.2909
60             3.2000
66             3.2273
72             3.1598
79             3.2026
86             3.1745
94             3.1596
103            3.1554
113            3.1195
124            3.3468
136            3.3162
149            3.3020
163            3.2945
179            3.2682
196            3.2194
215            3.2233
236            3.1822
259            3.1854
284            3.1514
312            3.1378
343            3.1400
377            3.1194
414            3.1136
455            3.1055
500            3.0860
550            3.0855
605            3.0744
665            3.0677
731            3.0657
804            3.0535
884            3.0487
972            3.0443


More information about the gmp-devel mailing list