[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun May 17 18:41:34 UTC 2015
details: /var/hg/gmp/rev/c97731cbfd6c
changeset: 16633:c97731cbfd6c
user: Torbjorn Granlund <torbjorng at google.com>
date: Sat May 16 22:33:02 2015 +0200
description:
Rewrite for speed and size.
details: /var/hg/gmp/rev/58337f05a890
changeset: 16634:58337f05a890
user: Torbjorn Granlund <torbjorng at google.com>
date: Sat May 16 22:33:13 2015 +0200
description:
ChangeLog
details: /var/hg/gmp/rev/cbf488852dc6
changeset: 16635:cbf488852dc6
user: Torbjorn Granlund <torbjorng at google.com>
date: Sun May 17 20:40:02 2015 +0200
description:
Rewrite for speed.
details: /var/hg/gmp/rev/a2d5e6b2a29e
changeset: 16636:a2d5e6b2a29e
user: Torbjorn Granlund <torbjorng at google.com>
date: Sun May 17 20:40:12 2015 +0200
description:
ChangeLog
details: /var/hg/gmp/rev/d57148a406f4
changeset: 16637:d57148a406f4
user: Torbjorn Granlund <torbjorng at google.com>
date: Sun May 17 20:41:29 2015 +0200
description:
Updates for release.
diffstat:
ChangeLog | 8 +
NEWS | 3 +
mpn/arm/v6/addmul_2.asm | 89 +++-----
mpn/arm/v6/sqr_basecase.asm | 410 +++++++++++++++++++++++--------------------
4 files changed, 265 insertions(+), 245 deletions(-)
diffs (truncated from 704 to 300 lines):
diff -r d2482eadb2c7 -r d57148a406f4 ChangeLog
--- a/ChangeLog Fri May 15 21:42:17 2015 +0200
+++ b/ChangeLog Sun May 17 20:41:29 2015 +0200
@@ -1,3 +1,11 @@
+2015-05-17 Torbjörn Granlund <torbjorng at google.com>
+
+ * mpn/arm/v6/sqr_basecase.asm: Rewrite for speed.
+
+2015-05-16 Torbjörn Granlund <torbjorng at google.com>
+
+ * mpn/arm/v6/addmul_2.asm: Rewrite for speed and size.
+
2015-05-15 Torbjörn Granlund <torbjorng at google.com>
* mpn/arm/v7a/cora7/gmp-mparam.h: New file.
diff -r d2482eadb2c7 -r d57148a406f4 NEWS
--- a/NEWS Fri May 15 21:42:17 2015 +0200
+++ b/NEWS Sun May 17 20:41:29 2015 +0200
@@ -6,6 +6,9 @@
Changes between GMP version 6.0.* and 6.1.0
+ SPEEDUPS
+ * Speedup for Intel Broadwell.
+
FEATURES
* New C++ functions gcd and lcm for mpz_class.
diff -r d2482eadb2c7 -r d57148a406f4 mpn/arm/v6/addmul_2.asm
--- a/mpn/arm/v6/addmul_2.asm Fri May 15 21:42:17 2015 +0200
+++ b/mpn/arm/v6/addmul_2.asm Sun May 17 20:41:29 2015 +0200
@@ -2,7 +2,7 @@
dnl Contributed to the GNU project by Torbjörn Granlund.
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -35,20 +35,12 @@
C cycles/limb
C StrongARM: -
C XScale -
-C ARM11 5.5
-C Cortex-A7 3.88
-C Cortex-A8 5.5
+C ARM11 4.68
+C Cortex-A7 3.625
+C Cortex-A8 4
C Cortex-A9 2.25
C Cortex-A15 2.5
-C This is believed to be optimal for A15 for any unrolling, and optimal for A9
-C for 4-way unrolling. Using separate pointer update instructions is necessary
-C for optimal A9 speed.
-
-C TODO:
-C * Start the first multiply or multiplies directly at function entry.
-
-
define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
@@ -65,75 +57,66 @@
ASM_START()
PROLOGUE(mpn_addmul_2)
- push { r4, r5, r6, r7, r8, r9 }
+ push { r4-r9 }
- ldm vp, { v0, v1 }
+ ldrd v0, v1, [vp, #0]
mov cya, #0
mov cyb, #0
tst n, #1
beq L(evn)
-L(odd): ldr r5, [rp, #0]
- ldr u0, [up, #0]
- ldr r4, [rp, #4]
+L(odd): ldr u1, [up, #0]
+ ldr r4, [rp, #0]
tst n, #2
beq L(fi1)
-L(fi3): sub up, up, #12
- sub rp, rp, #12
+L(fi3): sub up, up, #8
+ sub rp, rp, #8
b L(lo3)
L(fi1): sub n, n, #1
- sub up, up, #4
- sub rp, rp, #4
- b L(lo1)
+ b L(top)
-L(evn): ldr r4, [rp, #0]
- ldr u1, [up, #0]
- ldr r5, [rp, #4]
+L(evn): ldr u0, [up, #0]
+ ldr r5, [rp, #0]
tst n, #2
bne L(fi2)
-L(fi0): sub up, up, #8
- sub rp, rp, #8
+L(fi0): sub up, up, #4
+ sub rp, rp, #4
b L(lo0)
-L(fi2): subs n, n, #2
- bls L(end)
+L(fi2): sub up, up, #12
+ sub rp, rp, #12
+ b L(lo2)
ALIGN(16)
-L(top): ldr u0, [up, #4]
+L(top): ldr r5, [rp, #4]
umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
str r4, [rp, #0]
- ldr r4, [rp, #8]
+L(lo0): ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+L(lo3): ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
umaal r5, cyb, u1, v1
-L(lo1): ldr u1, [up, #8]
+ str r4, [rp, #8]
+L(lo2): ldr r4, [rp, #16]!
umaal r5, cya, u0, v0
- str r5, [rp, #4]
- ldr r5, [rp, #12]
+ ldr u1, [up, #16]!
umaal r4, cyb, u0, v1
-L(lo0): ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #8]
- ldr r4, [rp, #16]
- umaal r5, cyb, u1, v1
-L(lo3): ldr u1, [up, #16]
- umaal r5, cya, u0, v0
- str r5, [rp, #12]
- ldr r5, [rp, #20]
- add rp, rp, #16
- umaal r4, cyb, u0, v1
- add up, up, #16
+ str r5, [rp, #-4]
subs n, n, #4
bhi L(top)
L(end): umaal r4, cya, u1, v0
- ldr u0, [up, #4]
- umaal r5, cyb, u1, v1
+ umaal cya, cyb, u1, v1
str r4, [rp, #0]
- umaal r5, cya, u0, v0
- umaal cya, cyb, u0, v1
- str r5, [rp, #4]
- str cya, [rp, #8]
+ str cya, [rp, #4]
mov r0, cyb
- pop { r4, r5, r6, r7, r8, r9 }
+ pop { r4-r9 }
bx r14
EPILOGUE()
diff -r d2482eadb2c7 -r d57148a406f4 mpn/arm/v6/sqr_basecase.asm
--- a/mpn/arm/v6/sqr_basecase.asm Fri May 15 21:42:17 2015 +0200
+++ b/mpn/arm/v6/sqr_basecase.asm Sun May 17 20:41:29 2015 +0200
@@ -2,7 +2,7 @@
dnl Contributed to the GNU project by Torbjörn Granlund.
-dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
@@ -49,21 +49,25 @@
C \ /
C \ /
C \ /
-C tail(0m2) tail(1m2)
+C cor3 cor2
C \ /
C \ /
C sqr_diag_addlsh1
C TODO
+C * Align more labels.
C * Further tweak counter and updates in outer loops. (This could save
C perhaps 5n cycles).
+C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then
+C initialise loop counter i with a right shift.
C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
C (This could save 2-3 cycles for n > 4.)
-C * Optimise sqr_diag_addlsh1 loop. (This could save O(n) cycles.)
-C * Implement larger final corners (xit/tix). Also stop loops earlier
-C suppressing writes of upper-most rp[] values. (This could save 10-20
-C cycles for n > 4.)
-C * Is the branch table really faster than discrete branches?
+C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry
+C propagation.
+C * Stop loops earlier suppressing writes of upper-most rp[] values.
+C * The addmul_2 loops here run well on all cores, but mul_2 runs poorly,
+C particularly on Cortex-A8.
+
define(`rp', r0)
define(`up', r1)
@@ -95,7 +99,7 @@
b L(3m4)
-L(1m4): push {r4-r10,r11,r14}
+L(1m4): push {r4-r11, r14}
mov n_saved, n
sub i, n, #4
sub n, n, #2
@@ -109,7 +113,7 @@
mov r4, #0
b L(ko0)
-L(3m4): push {r4-r10,r11,r14}
+L(3m4): push {r4-r11, r14}
mov n_saved, n
sub i, n, #4
sub n, n, #2
@@ -123,7 +127,7 @@
mov r4, #0
b L(ko2)
-L(2m4): push {r4-r10,r11,r14}
+L(2m4): push {r4-r11, r14}
mov n_saved, n
sub i, n, #4
sub n, n, #2
@@ -136,7 +140,7 @@
mov r5, #0
b L(ko1)
-L(0m4): push {r4-r10,r11,r14}
+L(0m4): push {r4-r11, r14}
mov n_saved, n
sub i, n, #4
sub n, n, #2
@@ -171,45 +175,7 @@
umaal r4, cyb, u0, v1
subs i, i, #4
bhi L(top)
- bx r10
-L(evnloop):
- subs i, n, #4
- sub n, n, #2
- blt L(tix)
- ldm up, {v0,v1,u0}
- add up, up, #4
- mov cya, #0
- mov cyb, #0
- ldm rp, {r4,r5}
- sub rp, rp, #4
- umaal r4, cya, v1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- b L(lo2)
-L(ua2): ldr u0, [up, #4]
- umaal r4, cya, u1, v0
- str r4, [rp, #4]
- ldr r4, [rp, #12]
- umaal r5, cyb, u1, v1
-L(lo2): ldr u1, [up, #8]
- umaal r5, cya, u0, v0
- str r5, [rp, #8]
- ldr r5, [rp, #16]
- umaal r4, cyb, u0, v1
- ldr u0, [up, #12]
- umaal r4, cya, u1, v0
- str r4, [rp, #12]
- ldr r4, [rp, #20]
- umaal r5, cyb, u1, v1
- ldr u1, [up, #16]!
- umaal r5, cya, u0, v0
- str r5, [rp, #16]!
- ldr r5, [rp, #8]
- umaal r4, cyb, u0, v1
- subs i, i, #4
- bhi L(ua2)
-L(am2_0m4):
umaal r4, cya, u1, v0
ldr u0, [up, #4]
umaal r5, cyb, u1, v1
@@ -219,177 +185,237 @@
str r5, [rp, #8]
More information about the gmp-commit mailing list