ARM public key benchmark

Niels Möller nisse at lysator.liu.se
Wed Apr 3 10:08:18 CEST 2013


nisse at lysator.liu.se (Niels Möller) writes:

> So it should be doable with the addmul_1 loop and two additional,
> non-recurrency, NOT (mvn) instructions per limb, and then maybe some extra
> logic for the return value. One could aim for 4.25 c/l, I guess.

The code below seems to give correct results, but it still runs at
5.25 c/l. Maybe the scheduling can be improved; I just put the new mvn
instructions immediately before each umaal and str.

Regards,
/Niels

dnl  ARM mpn_submul_1.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C StrongARM:	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 5.25
C Cortex-A15	 ?

C TODO
C  * Micro-optimise feed-in code.
C  * Optimise for n=1,2 by delaying register saving.
C  * Try using ldm/stm.

define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`v0',`r3')

ASM_START()
C mpn_submul_1: for each of the n limbs, the limb read from rp is
C complemented (mvn), run through umaal together with the matching up limb,
C v0 and the running carry in r12, complemented again, and stored back to rp.
C With B = 2^32 and ~x = B-1-x this gives, per limb,
C	~(~r + u*v + cin) = r - u*v - cin + cout*B
C so the stored limb is the low word of r - u*v - cin, and the high word that
C umaal leaves in r12 is the borrow passed on to the next limb.  The final r12
C is returned in r0.
C
C Register use inside this function:
C	r4, r5	  limbs loaded from up, used alternately
C	r6, r7	  limbs loaded from rp, used alternately
C	r12	  carry (borrow) limb chained through every umaal
C
C umaal RdLo, RdHi, Rn, Rm computes RdHi:RdLo = Rn*Rm + RdLo + RdHi.
C
C NOTE(review): nothing here special-cases n = 0; presumably callers always
C pass n >= 1 -- confirm against the mpn interface contract.
PROLOGUE(mpn_submul_1)
	stmfd	sp!, { r4, r5, r6, r7 }		C save the scratch registers used below

C Dispatch on n mod 4 to pick the entry point into the 4-way unrolled loop.
	ands	r6, n, #3			C r6 = n mod 4, Z set when it is 0
	mov	r12, #0				C carry limb starts at zero
	beq	L(fi0)				C n mod 4 = 0
	cmp	r6, #2
	bcc	L(fi1)				C n mod 4 = 1
	beq	L(fi2)				C n mod 4 = 2
						C fall through: n mod 4 = 3

C Feed-in for n mod 4 = 3: rp is left unadjusted, since the store at lo3
C post-increments it by 16.
L(fi3):	ldr	r4, [up], #4
	ldr	r6, [rp, #0]
	ldr	r5, [up], #4
	b	L(lo3)

C Feed-in for n mod 4 = 0: rp is advanced by 4 to match the [rp, #-4] store
C that follows lo0.
L(fi0):	ldr	r5, [up], #4
	ldr	r7, [rp], #4
	ldr	r4, [up], #4
	b	L(lo0)

C Feed-in for n mod 4 = 1: rp is advanced by 8 to match the [rp, #-8] stores
C after lo1 and after L(1).  n is decremented here; when that leaves zero
C (n was 1) the single limb is handled directly by the tail at L(1).
L(fi1):	ldr	r4, [up], #4
	ldr	r6, [rp], #8
	subs	n, n, #1
	beq	L(1)
	ldr	r5, [up], #4
	b	L(lo1)

C Feed-in for n mod 4 = 2: rp is advanced by 12 to match the [rp, #-12]
C store of r7 at the loop top and in the tail.
L(fi2):	ldr	r5, [up], #4
	ldr	r7, [rp], #12
	ldr	r4, [up], #4
	b	L(lo2)

C Main loop, four limbs per iteration.  Each quarter has the same shape:
C mvn + umaal on the current rp limb, loads of the next rp and up limbs,
C then mvn + str of the result.  The store of the r7 result produced at lo2
C is deferred to the next loop top (or to the tail after the loop).
	ALIGN(16)
L(top):	ldr	r6, [rp, #-8]
	ldr	r5, [up], #4
	mvn	r7, r7				C un-complement the result from lo2
	str	r7, [rp, #-12]
L(lo1):	mvn	r6, r6				C complement the rp limb before accumulating
	umaal	r6, r12, r4, v0			C r12:r6 = r4*v0 + r6 + r12
	ldr	r7, [rp, #-4]
	ldr	r4, [up], #4
	mvn	r6, r6				C complement back to get the result limb
	str	r6, [rp, #-8]
L(lo0):	mvn	r7, r7
	umaal	r7, r12, r5, v0			C r12:r7 = r5*v0 + r7 + r12
	ldr	r6, [rp, #0]
	ldr	r5, [up], #4
	mvn	r7, r7
	str	r7, [rp, #-4]
L(lo3):	mvn	r6, r6
	umaal	r6, r12, r4, v0
	ldr	r7, [rp, #4]
	ldr	r4, [up], #4
	mvn	r6, r6
	str	r6, [rp], #16			C store and advance rp by four limbs
L(lo2):	mvn	r7, r7
	umaal	r7, r12, r5, v0
	subs	n, n, #4
	bhi	L(top)				C loop while n - 4 did not borrow and is nonzero

C Tail: store the pending r7 result, then process the final limb using the
C up limb already loaded into r4.
	ldr	r6, [rp, #-8]
	mvn	r7, r7
	str	r7, [rp, #-12]
L(1):	mvn	r6, r6
	umaal	r6, r12, r4, v0
	mvn	r6, r6
	str	r6, [rp, #-8]
	mov	r0, r12				C return the final carry (borrow) limb
	ldmfd	sp!, { r4, r5, r6, r7 }		C restore the registers saved on entry
	bx	lr
EPILOGUE()

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.


More information about the gmp-devel mailing list