ARM public key benchmark
Niels Möller
nisse at lysator.liu.se
Thu Apr 4 09:25:19 CEST 2013
nisse at lysator.liu.se (Niels Möller) writes:
> Not until now. Actually looks nice:
>
> A - b C = A + b (~C) + b - b B^n
>
> So this saves one not instruction, and we have to add the scalar b to
> the incoming carry and subtract the outgoing carry from b.
With this trick, I get down to 4.4 cycles per limb (c/l) on Cortex-A9.
The code actually worked on the first attempt ;-) Below is the diff from
addmul_1, and I have also attached the complete file.
I think the placement of the mvn instructions is reasonable, just after
the previous umaal, but I haven't played with the scheduling.
I'll also try using fewer updates of the up pointer; that seems to save
half a cycle, and could perhaps speed up addmul_1 too.
--- addmul_1.asm 2013-04-02 10:59:41.985393196 +0200
+++ submul_1.asm 2013-04-04 09:07:52.863842403 +0200
@@ -1,6 +1,6 @@
-dnl ARM mpn_addmul_1.
+dnl ARM mpn_submul_1.
-dnl Copyright 2012 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -24,8 +24,8 @@
C XScale -
C Cortex-A7 ?
C Cortex-A8 ?
-C Cortex-A9 3.25
-C Cortex-A15 4
+C Cortex-A9 4.4
+C Cortex-A15 ?
C TODO
C * Micro-optimise feed-in code.
@@ -38,27 +38,30 @@
define(`v0',`r3')
ASM_START()
-PROLOGUE(mpn_addmul_1)
+PROLOGUE(mpn_submul_1)
stmfd sp!, { r4, r5, r6, r7 }
ands r6, n, #3
- mov r12, #0
+ mov r12, v0
beq L(fi0)
cmp r6, #2
bcc L(fi1)
beq L(fi2)
L(fi3): ldr r4, [up], #4
+ mvn r4, r4
ldr r6, [rp, #0]
ldr r5, [up], #4
b L(lo3)
L(fi0): ldr r5, [up], #4
+ mvn r5, r5
ldr r7, [rp], #4
ldr r4, [up], #4
b L(lo0)
L(fi1): ldr r4, [up], #4
+ mvn r4, r4
ldr r6, [rp], #8
subs n, n, #1
beq L(1)
@@ -66,6 +69,7 @@
b L(lo1)
L(fi2): ldr r5, [up], #4
+ mvn r5, r5
ldr r7, [rp], #12
ldr r4, [up], #4
b L(lo2)
@@ -75,18 +79,22 @@
ldr r5, [up], #4
str r7, [rp, #-12]
L(lo1): umaal r6, r12, r4, v0
+ mvn r5, r5
ldr r7, [rp, #-4]
ldr r4, [up], #4
str r6, [rp, #-8]
L(lo0): umaal r7, r12, r5, v0
+ mvn r4, r4
ldr r6, [rp, #0]
ldr r5, [up], #4
str r7, [rp, #-4]
L(lo3): umaal r6, r12, r4, v0
+ mvn r5, r5
ldr r7, [rp, #4]
ldr r4, [up], #4
str r6, [rp], #16
L(lo2): umaal r7, r12, r5, v0
+ mvn r4, r4
subs n, n, #4
bhi L(top)
@@ -94,7 +102,7 @@
str r7, [rp, #-12]
L(1): umaal r6, r12, r4, v0
str r6, [rp, #-8]
- mov r0, r12
+ sub r0, v0, r12
ldmfd sp!, { r4, r5, r6, r7 }
bx lr
EPILOGUE()
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: submul_1.asm
URL: <http://gmplib.org/list-archives/gmp-devel/attachments/20130404/fc86a922/attachment.ksh>
-------------- next part --------------
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
More information about the gmp-devel mailing list