ARM public key benchmark

Thu Apr 4 09:25:19 CEST 2013

nisse at lysator.liu.se (Niels Möller) writes:

> Not until now. Actually looks nice:
>
>   A - b C = A + b (~C) + b - b B^n 
>
> So this saves one not instruction, and we have to add and subtract the
> scalar b from incoming and outgoing carry.

With this trick, I get down to 4.4 c/l (cycles per limb) on the Cortex-A9.
The code actually worked on the first attempt ;-) Below is the diff from
addmul_1, and I also attach the complete file.

I think the placement of the mvn instructions is reasonable, just after
the previous umaal, but I haven't played with the scheduling.

I'll also try using fewer updates of the up pointer; that seems to save
half a cycle, and could perhaps speed up addmul_1 too.

--- addmul_1.asm	2013-04-02 10:59:41.985393196 +0200
+++ submul_1.asm	2013-04-04 09:07:52.863842403 +0200
@@ -1,6 +1,6 @@
-dnl  ARM mpn_addmul_1.
+dnl  ARM mpn_submul_1.
 
-dnl  Copyright 2012 Free Software Foundation, Inc.
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -24,8 +24,8 @@
 C XScale	 -
 C Cortex-A7	 ?
 C Cortex-A8	 ?
-C Cortex-A9	 3.25
-C Cortex-A15	 4
+C Cortex-A9	 4.4
+C Cortex-A15	 ?
 
 C TODO
 C  * Micro-optimise feed-in code.
@@ -38,27 +38,30 @@
 define(`v0',`r3')
 
 ASM_START()
-PROLOGUE(mpn_addmul_1)
+PROLOGUE(mpn_submul_1)
 	stmfd	sp!, { r4, r5, r6, r7 }
 
 	ands	r6, n, #3
-	mov	r12, #0
+	mov	r12, v0
 	beq	L(fi0)
 	cmp	r6, #2
 	bcc	L(fi1)
 	beq	L(fi2)
 
 L(fi3):	ldr	r4, [up], #4
+	mvn	r4, r4
 	ldr	r6, [rp, #0]
 	ldr	r5, [up], #4
 	b	L(lo3)
 
 L(fi0):	ldr	r5, [up], #4
+	mvn	r5, r5
 	ldr	r7, [rp], #4
 	ldr	r4, [up], #4
 	b	L(lo0)
 
 L(fi1):	ldr	r4, [up], #4
+	mvn	r4, r4
 	ldr	r6, [rp], #8
 	subs	n, n, #1
 	beq	L(1)
@@ -66,6 +69,7 @@
 	b	L(lo1)
 
 L(fi2):	ldr	r5, [up], #4
+	mvn	r5, r5
 	ldr	r7, [rp], #12
 	ldr	r4, [up], #4
 	b	L(lo2)
@@ -75,18 +79,22 @@
 	ldr	r5, [up], #4
 	str	r7, [rp, #-12]
 L(lo1):	umaal	r6, r12, r4, v0
+	mvn	r5, r5
 	ldr	r7, [rp, #-4]
 	ldr	r4, [up], #4
 	str	r6, [rp, #-8]
 L(lo0):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
 	ldr	r6, [rp, #0]
 	ldr	r5, [up], #4
 	str	r7, [rp, #-4]
 L(lo3):	umaal	r6, r12, r4, v0
+	mvn	r5, r5
 	ldr	r7, [rp, #4]
 	ldr	r4, [up], #4
 	str	r6, [rp], #16
 L(lo2):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
 	subs	n, n, #4
 	bhi	L(top)
 
@@ -94,7 +102,7 @@
 	str	r7, [rp, #-12]
 L(1):	umaal	r6, r12, r4, v0
 	str	r6, [rp, #-8]
-	mov	r0, r12
+	sub	r0, v0, r12
 	ldmfd	sp!, { r4, r5, r6, r7 }
 	bx	lr
 EPILOGUE()


-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: submul_1.asm
URL: <http://gmplib.org/list-archives/gmp-devel/attachments/20130404/fc86a922/attachment.ksh>
-------------- next part --------------


-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.