ARM public key benchmark
Niels Möller
nisse at lysator.liu.se
Wed Apr 3 10:08:18 CEST 2013
nisse at lysator.liu.se (Niels Möller) writes:
> So it should be doable with the addmul_1 loop and two additional,
> non-recurrency, not instructions per limb, and then maybe some extra
> logic for the return value. One could aim for 4.25 c/l, I guess.
The code below seems to give correct results, but it still runs at 5.25 c/l.
Maybe the scheduling can be improved; I just put the new mvn instructions
immediately before the umaal and str instructions.
Regards,
/Niels
dnl ARM mpn_submul_1.
dnl Copyright 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C StrongARM: -
C XScale -
C Cortex-A7 ?
C Cortex-A8 ?
C Cortex-A9 5.25
C Cortex-A15 ?
C TODO
C * Micro-optimise feed-in code.
C * Optimise for n=1,2 by delaying register saving.
C * Try using ldm/stm.
C Symbolic names for the registers that hold the arguments of mpn_submul_1.
C rp: pointer to the limbs that are loaded, updated and stored back.
define(`rp',`r0')
C up: pointer to the limbs that are only loaded and multiplied.
define(`up',`r1')
C n: limb count; its low two bits select the entry point into the loop.
define(`n', `r2')
C v0: the single limb every up limb is multiplied by.
define(`v0',`r3')
ASM_START()
C mpn_submul_1: for each of the n limbs, compute rp[i] - up[i] * v0 - carry,
C store the low word back to rp[i], and return the final carry limb (the
C borrow out of the most significant limb) in r0.
C
C Method: the addmul_1 umaal loop, with each rp limb complemented before and
C after the multiply-accumulate.  With B = 2^32 and not(x) = B - 1 - x,
C   not(not(r) + p) = r - p  (mod B),
C so the stored limbs are those of the difference, while r12 carries the
C high word from one limb to the next.
C
C umaal lo, hi, a, b computes the 64-bit value hi:lo = a * b + lo + hi.
C
C Register use:
C   r4, r5   limbs loaded from up (r4 is paired with r6, r5 with r7)
C   r6, r7   limbs loaded from rp, then the results to be stored
C   r12      running carry limb, cleared before the first umaal
C
C The loop body is unrolled four ways and software pipelined: the loads for
C the next limb sit between a umaal and the store of its result.
C
C NOTE(review): n = 0 is not special-cased (it takes the L(fi0) path and
C processes four limbs); presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(mpn_submul_1)
C Save r4-r7; they are used as scratch below and restored before returning.
stmfd sp!, { r4, r5, r6, r7 }
C r6 = n mod 4 selects the feed-in path; r12 = 0 is the initial carry.
ands r6, n, #3
mov r12, #0
C n mod 4 == 0
beq L(fi0)
cmp r6, #2
C n mod 4 == 1
bcc L(fi1)
C n mod 4 == 2
beq L(fi2)
C Fall through: n mod 4 == 3.  rp is left unchanged so that the offsets used
C from L(lo3) onwards address the first limb.
L(fi3): ldr r4, [up], #4
ldr r6, [rp, #0]
ldr r5, [up], #4
b L(lo3)
C n mod 4 == 0: advance rp by one limb to match the offsets at L(lo0).
L(fi0): ldr r5, [up], #4
ldr r7, [rp], #4
ldr r4, [up], #4
b L(lo0)
C n mod 4 == 1: advance rp by two limbs.  If n was exactly 1, the only limb
C is handled directly by the final step at L(1); otherwise n is now a
C multiple of 4 and the loop is entered at L(lo1).
L(fi1): ldr r4, [up], #4
ldr r6, [rp], #8
subs n, n, #1
beq L(1)
ldr r5, [up], #4
b L(lo1)
C n mod 4 == 2: advance rp by three limbs to match the offsets at L(lo2).
L(fi2): ldr r5, [up], #4
ldr r7, [rp], #12
ldr r4, [up], #4
b L(lo2)
ALIGN(16)
C Main loop, four limbs per pass.  Each stage complements the rp limb, does
C the umaal, loads the operands for the following stage, then complements
C and stores the result.
C Top of loop: load the next pair, then store the result left in r7 by the
C L(lo2) stage of the previous pass.
L(top): ldr r6, [rp, #-8]
ldr r5, [up], #4
mvn r7, r7
str r7, [rp, #-12]
L(lo1): mvn r6, r6
umaal r6, r12, r4, v0
ldr r7, [rp, #-4]
ldr r4, [up], #4
mvn r6, r6
str r6, [rp, #-8]
L(lo0): mvn r7, r7
umaal r7, r12, r5, v0
ldr r6, [rp, #0]
ldr r5, [up], #4
mvn r7, r7
str r7, [rp, #-4]
L(lo3): mvn r6, r6
umaal r6, r12, r4, v0
ldr r7, [rp, #4]
ldr r4, [up], #4
mvn r6, r6
C This store also advances rp by four limbs for the next pass.
str r6, [rp], #16
L(lo2): mvn r7, r7
umaal r7, r12, r5, v0
C Count four limbs; loop again only while the remaining count is positive
C (unsigned higher: no borrow and result non-zero).
subs n, n, #4
bhi L(top)
C Wind-down: one limb is still pending.  Load its rp limb (its up limb is
C already in r4) and store the result left in r7 by the L(lo2) stage.
ldr r6, [rp, #-8]
mvn r7, r7
str r7, [rp, #-12]
C Final limb; also the direct target for n == 1.
L(1): mvn r6, r6
umaal r6, r12, r4, v0
mvn r6, r6
str r6, [rp, #-8]
C Return the carry limb accumulated in r12.
mov r0, r12
ldmfd sp!, { r4, r5, r6, r7 }
bx lr
EPILOGUE()
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
More information about the gmp-devel
mailing list