[Gmp-commit] /var/hg/gmp: Add ARM addlsh1_n and sublsh1_n.

Sun Apr 29 17:37:59 CEST 2012

details:   /var/hg/gmp/rev/642cd5ba0529
changeset: 14907:642cd5ba0529
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Apr 29 17:37:53 2012 +0200
description:
Add ARM addlsh1_n and sublsh1_n.

diffstat:

 ChangeLog              |    2 +
 mpn/arm/aorslsh1_n.asm |  155 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 0 deletions(-)

diffs (170 lines):

diff -r 521b092f6517 -r 642cd5ba0529 ChangeLog

--- a/ChangeLog	Sun Apr 29 03:05:52 2012 +0200
+++ b/ChangeLog	Sun Apr 29 17:37:53 2012 +0200
@@ -1,5 +1,7 @@
 2012-04-29  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/arm/aorslsh1_n.asm: New file.
+
 	* mpn/arm/mod_34lsub1.asm: New file.
 
 	* mpn/arm/v6t2/divrem_1.asm: New file.
diff -r 521b092f6517 -r 642cd5ba0529 mpn/arm/aorslsh1_n.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/aorslsh1_n.asm	Sun Apr 29 17:37:53 2012 +0200
@@ -0,0 +1,155 @@
+dnl  ARM mpn_addlsh1_n and mpn_sublsh1_n
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	      addlsh1_n       sublsh1_n
+C	     cycles/limb     cycles/limb
+C StrongARM	 ?		 ?
+C XScale	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.17		 3.7
+C Cortex-A15	 ?		 ?
+
+C TODO
+C  * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
+C    The sublsh1_n code could surely be tweaked, its REVCY slows down things
+C    very much.  If two insns are really needed, it might help to separate them
+C    for better micro-parallelism.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, $1, #2')
+  define(`SAVECY',	`sbc	$1, $2, #0')
+  define(`RESTCY',	`adds	$1, $1, #1')
+  define(`REVCY',	`')
+  define(`INICYR',	`mov	$1, #0')
+  define(`r10r11',	`r11')
+  define(`func',	mpn_addlsh1_n)
+  define(`func_nc',	mpn_addlsh1_nc)')
+ifdef(`OPERATION_sublsh1_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`adc	r0, $1, #1')
+  define(`SAVECY',	`sbc	$1, $1, $1')
+  define(`RESTCY',	`adds	$1, $1, #1')
+  define(`REVCY',	`sbc	$1, $1, $1
+			adds	$1, $1, #1')
+  define(`INICYR',	`mov	$1, #-1')
+  define(`r10r11',	`r10')
+  define(`func',	mpn_sublsh1_n)
+  define(`func_nc',	mpn_sublsh1_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r10r11, r14}
+
+ifdef(`OPERATION_addlsh1_n', `
+	mvn	r11, #0
+')
+	INICYR(	r14)
+	subs	n, n, #3
+	blt	L(le2)			C carry clear on branch path
+
+	cmn	r0, #0			C clear carry
+	ldmia	vp!, {r8, r9, r10}
+	b	L(mid)
+
+L(top):	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	REVCY(r14)
+	stmia	rp!, {r4, r5, r6}
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r14, r11)
+	subs	n, n, #3
+	blt	L(exi)
+	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	REVCY(r12)
+	stmia	rp!, {r4, r5, r6}
+L(mid):	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r12, r11)
+	subs	n, n, #3
+	bge	L(top)
+
+	mov	r7, r12			C swap alternating...
+	mov	r12, r14		C ...carry-save...
+	mov	r14, r7			C ...registers
+
+L(exi):	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	stmia	rp!, {r4, r5, r6}
+
+	REVCY(r12)
+L(le2):	tst	n, #1			C n = {-1,-2,-3} map to [2], [1], [0]
+	beq	L(e1)
+
+L(e02):	tst	n, #2
+	beq	L(rt0)
+	ldm	vp, {r8, r9}
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	ldm	up, {r4, r5}
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	stm	rp, {r4, r5}
+	b	L(rt1)
+
+L(e1):	ldr	r8, [vp]
+	adcs	r8, r8, r8
+	ldr	r4, [up]
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	str	r4, [rp]
+
+L(rt1):	mov	r14, r12
+	REVCY(r12)
+L(rt0):	RETVAL(	r14)
+	pop	{r4-r10r11, r14}
+	bx	r14
+EPILOGUE()