[Gmp-commit] /var/hg/gmp: Add ARM v6 sqr_basecase.
mercurial at gmplib.org
Mon Apr 30 22:28:53 CEST 2012
details: /var/hg/gmp/rev/478bb542a457
changeset: 14924:478bb542a457
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Apr 30 22:28:50 2012 +0200
description:
Add ARM v6 sqr_basecase.
diffstat:
ChangeLog | 4 +
mpn/arm/v6/sqr_basecase.asm | 511 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 515 insertions(+), 0 deletions(-)
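
For context: mpn_sqr_basecase computes the full 2n-limb square of an n-limb
operand. As orientation, here is a rough C sketch (illustrative only, not GMP
code; the portable version lives in mpn/generic/sqr_basecase.c, and the
assembly below organizes the same work around mul_2/addmul_2 chains): first
accumulate the off-diagonal products u[i]*u[j] for i < j, then double them and
add the diagonal squares u[i]^2.

/* Illustrative sketch only: basecase squaring with 32-bit limbs.
   rp must have room for 2n limbs; n >= 1.  */
#include <stdint.h>

typedef uint32_t limb;
typedef uint64_t dlimb;

static void
sqr_basecase_sketch (limb *rp, const limb *up, int n)
{
  int i, j;

  /* Phase 1: off-diagonal products u[i]*u[j], i < j.  */
  for (i = 0; i < 2 * n; i++)
    rp[i] = 0;
  for (i = 0; i < n; i++)
    {
      dlimb cy = 0;
      for (j = i + 1; j < n; j++)
        {
          dlimb t = (dlimb) up[i] * up[j] + rp[i + j] + cy;
          rp[i + j] = (limb) t;
          cy = t >> 32;
        }
      rp[i + n] = (limb) cy;
    }

  /* Phase 2: rp = 2*rp + diagonal squares; this is the job of the
     sqr_diag_addlsh1 phase named in the diagram below.  */
  dlimb acc = 0;
  for (i = 0; i < n; i++)
    {
      dlimb d = (dlimb) up[i] * up[i];
      acc += 2 * (dlimb) rp[2 * i] + (d & 0xffffffff);
      rp[2 * i] = (limb) acc;  acc >>= 32;
      acc += 2 * (dlimb) rp[2 * i + 1] + (d >> 32);
      rp[2 * i + 1] = (limb) acc;  acc >>= 32;
    }
}
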
diffs (truncated from 526 to 300 lines):
diff -r 2073e4006eea -r 478bb542a457 ChangeLog
--- a/ChangeLog Mon Apr 30 20:13:43 2012 +0200
+++ b/ChangeLog Mon Apr 30 22:28:50 2012 +0200
@@ -1,3 +1,7 @@
+2012-04-30 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/v6/sqr_basecase.asm: New file.
+
2012-04-30 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/generic/comb_tables.c: New file.
diff -r 2073e4006eea -r 478bb542a457 mpn/arm/v6/sqr_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/v6/sqr_basecase.asm Mon Apr 30 22:28:50 2012 +0200
@@ -0,0 +1,511 @@
+dnl ARM v6 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
+C | | | |
+C | | | |
+C | | | |
+C \|/ \|/ \|/ \|/
+C ____________ ____________
+C / \ / \
+C \|/ \ \|/ \
+C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
+C \ /|\ \ /|\
+C \____________/ \____________/
+C \ /
+C \ /
+C \ /
+C tail(0m2) tail(1m2)
+C \ /
+C \ /
+C sqr_diag_addlsh1
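+C
+C Flow: each size class enters a mul_2 feed-in selected by n mod 4, falls
+C into the matching addmul_2 loop (am_2), handles the last block in a tail
+C routine, and finishes in sqr_diag_addlsh1, which adds the diagonal
+C u[i]^2 terms to twice the accumulated cross products.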
+
+C TODO
+C * Further tweak counter and updates in outer loops. (This could save
+C perhaps 5n cycles).
+C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
+C (This could save 2-3 cycles for n > 4.)
+C * Optimise sqr_diag_addlsh1 loop. (This could save O(n) cycles.)
+C * Implement larger final corners (xit/tix). Also stop loops earlier,
+C suppressing writes of the upper-most rp[] values. (This could save 10-20
+C cycles for n > 4.)
+C * Is the branch table really faster than discrete branches?
+
+define(`rp', r0)
+define(`up', r1)
+define(`n', r2)
+
+define(`v0', r3)
+define(`v1', r6)
+define(`i', r8)
+define(`n_saved', r14)
+define(`cya', r11)
+define(`cyb', r12)
+define(`u0', r7)
+define(`u1', r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
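+C Computed dispatch: r12 = n mod 4, biased by 4 when n > 4; since pc reads
+C as . + 8 on ARM, add pc, pc, r12, lsl #2 indexes the branch table below
+C (n = 1..4 get dedicated code, larger n enter the mod-4 feed-ins).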
+ and r12, n, #3
+ cmp n, #4
+ addgt r12, r12, #4
+ add pc, pc, r12, lsl #2
+ nop
+ b L(4)
+ b L(1)
+ b L(2)
+ b L(3)
+ b L(0m4)
+ b L(1m4)
+ b L(2m4)
+ b L(3m4)
+
+
+L(1m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_2m4)-.-8
+ ldm up, {v0,v1,u0}
+ sub up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-12
+ mov r4, #0
+ b L(ko0)
+
+L(3m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_0m4)-.-8
+ ldm up, {v0,v1,u0}
+ add up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-4
+ mov r4, #0
+ b L(ko2)
+
+L(2m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_3m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ umull r5, cya, v1, v0
+ str r5, [rp], #-8
+ mov r5, #0
+ b L(ko1)
+
+L(0m4): push {r4-r10,r11,r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_1m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ add up, up, #8
+ umull r5, cya, v1, v0
+ str r5, [rp, #0]
+ mov r5, #0
+
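+C First pass: a mul_2 loop computing the cross products of {v0,v1} with the
+C remaining limbs. umaal rlo, rhi, a, b forms a*b + rlo + rhi (64 bits), so
+C cya/cyb carry the two product chains while r4/r5 alternate as the limb
+C being completed and stored; bx r10 then enters the addmul_2 code chosen
+C at feed-in.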
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko2): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+L(ko1): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko0): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(top)
+ bx r10
+
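+C Even-path outer loop: each pass is an addmul_2, adding {v0,v1} times the
+C remaining limbs into the partial result (rp limbs are loaded, accumulated
+C with umaal, and stored back); between passes, up and rp are stepped past
+C the two limbs just consumed.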
+L(evnloop):
+ subs i, n, #4
+ sub n, n, #2
+ blt L(tix)
+ ldm up, {v0,v1,u0}
+ add up, up, #4
+ mov cya, #0
+ mov cyb, #0
+ ldm rp, {r4,r5}
+ sub rp, rp, #4
+ umaal r4, cya, v1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ b L(lo2)
+L(ua2): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+L(lo2): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+ ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua2)
+L(am2_0m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ sub i, n, #4
+ sub n, n, #2
+ ldm up, {v0,v1,u0}
+ sub up, up, #4
+ mov cya, #0
+ mov cyb, #0
+ ldr r4, [rp, #24]
+ ldr r5, [rp, #28]
+ add rp, rp, #12
+ umaal r4, cya, v1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ b L(lo0)
+L(ua0): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ umaal r4, cyb, u0, v1
+ ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ ldr r4, [rp, #20]
+ umaal r5, cyb, u1, v1
+L(lo0): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ ldr r5, [rp, #8]
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(ua0)
+L(am2_2m4):
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+ sub up, up, n, lsl #2
+ sub rp, rp, n, lsl #2
+ add up, up, #8
+ add rp, rp, #24
+ b L(evnloop)
+
+
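+C Odd-path outer loop: the same addmul_2 recurrence as L(evnloop), offset
+C by one limb.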
+L(oddloop):
+ subs i, n, #4
+ sub n, n, #2
+ blt L(xit)
+ ldm up, {v0,v1,u1}
+ mov cya, #0
+ mov cyb, #0
+ sub rp, rp, #8
+ ldr r5, [rp, #8]
+ ldr r4, [rp, #12]
+ umaal r5, cya, v1, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #16]
+ b L(lo1)
+L(ua1): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #12]
+ umaal r5, cyb, u1, v1
+ ldr u1, [up, #8]