Some arm cortex-a8 improvements

Mon Apr 23 16:15:09 CEST 2012

On 04/22/2012 03:06 PM, Torbjorn Granlund wrote:
> Richard Henderson <rth at twiddle.net> writes:
> 
>   I used the following, almost certainly not appropriate for general application.
>   
> [snip]
> 
> Thanks.  It would be very useful to make GMP timing work with the kernel
> Linux running on ARM.  Do you know if there are similar problems with,
> say, NetBSD?

I have no idea.

> The new code is carefully software pipelined, and mul_1 and addmul_1 run
> faster than both the old code and your patched code, at least on A9.
> Could you please try it on A8 and see if it is at least as fast as your
> code there?

I'll throw it in the hopper some time soon.

> If you have ARMv4 (e.g., StrongARM) and/or ARMv5 (e.g., XScale) I would
> appreciate if you could check if they still work after the latest
> changes.

I do not.  The gcc*.fsffrance.org farm says there are some extant, but
they are turned off at the moment.  Inside Red Hat I believe we only have
armv7+ available.

> Do you know if there is a portable mechanism for recognising an ARM
> core, akin to x86's cpuid?

Indeed I know that the hw registers that allow such recognition
are all privileged.  For Linux the best one can do is /proc/cpuinfo
or (to some extent) the values in AT_HWCAP.

FYI, I dug out the add/mul_2.asm files I was working on in February.
IIRC, they're correct as in they pass the testsuite, but I could not
show them to be faster than the add/mul_1 paths.

r~
-------------- next part --------------
dnl  ARM mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  add the result to a third limb vector.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C StrongARM:	??
C XScale:	??
C Cortex-a8:	??

dnl Register assignments for mpn_addmul_2.
dnl
dnl  rp, up, n: pointers and limb count used by the code below (r0-r2).
define(`rp',`r0')
define(`up',`r1')
define(`n',`r2')
dnl  vp and v0 deliberately share r3: the code loads v1 from [vp, #4] first
dnl  and only then overwrites the pointer with v0 from [vp, #0].
define(`vp',`r3')
define(`v0',`r3')
define(`v1',`r4')
dnl  ul: the current limb loaded from up.
define(`ul',`r5')
dnl  w0-w3: accumulator limbs; the unrolled loop rotates which three are live.
define(`w0',`r6')
define(`w1',`r7')
define(`w2',`r8')
define(`w3',`r9')
dnl  rl: the limb loaded from rp that the accumulator is added to.
define(`rl',`r10')
dnl  m0/m1: low and high halves of each umull result.
define(`m0',`ip')
define(`m1',`lr')

dnl Transcribed from the x86_64 version.
dnl Differences include:
dnl  - post-increment addressing instead of scaled indexed.
dnl    ul always holds "current" up limb, so no need to reload.
dnl  - m0/m1 used where x86 version uses eax/edx for multiply result.
dnl  - loads from rp occur asap in prep for what is the rmw add on x86.

ASM_START()
C mpn_addmul_2
C
C Multiplies the n limbs at up by the two limbs at vp and adds the product
C into the limbs at rp, walking up and rp upward one limb at a time.  The
C limb following the last one added to is overwritten with the next product
C limb (w0 at L(end)), and the top limb (w1) is returned in r0.
C
C The loop is unrolled four ways.  n mod 4 picks the entry point, and each
C entry stub adjusts n to a multiple of 4 so that the "subs n, n, #4" at the
C bottom of the loop reaches zero when one limb is left for L(end).
C
C NOTE(review): with n = 1 the L(b1) path still runs a full pass of the loop
C (four limbs) before reaching L(end), so this code appears to rely on
C n >= 2 -- confirm against the callers.
PROLOGUE(mpn_addmul_2)
	.fnstart
C r4-r10 hold v1, ul, w0-w3 and rl; lr is reused as m1, so it is saved here
C and popped straight into pc on return.
	push	{ r4-r10, lr }
	.save	{ r4-r10, lr }

	ldr	ul, [up], #4			C load first input limb
	ldr	v1, [vp, #4]			C load the 2-limb vector
	ldr	v0, [vp, #0]			C v0 shares r3 with vp, so it is loaded last
	ands	ip, n, #3			C ip = n mod 4, Z set when it is 0
	ldr	rl, [rp, #0]			C load the first addend limb

C The ldr above does not touch the flags set by ands.
	beq	L(b0)
	cmp	ip, #2
	beq	L(b2)
	bcc	L(b1)				C ip below 2 and not 0, so ip = 1
C Each stub forms the first ul * v0 product directly in the two accumulator
C limbs its entry point expects, zeroes the third one, and fixes up n.
L(b3):
	umull	w1, w2, ul, v0
	mov	w3, #0
	add	n, n, #1
	b	L(lo3)
L(b2):
	umull	w2, w3, ul, v0
	mov	w0, #0
	add	n, n, #2
	b	L(lo2)
L(b1):
	umull	w3, w0, ul, v0
	mov	w1, #0
	sub	n, n, #1
	b	L(lo1)
L(b0):
	umull	w0, w1, ul, v0
	mov	w2, #0
	b	L(lo0)

	ALIGN(16)
C Loop body.  Each of the four sections L(lo1), L(lo0), L(lo3), L(lo2)
C finishes one limb:
C   - m1:m0 = ul * v1
C   - add the low accumulator limb into rl and store it back through rp
C   - fetch the next up limb into ul and the next rp limb into rl
C   - add m1:m0 plus the carry into the two upper accumulator limbs
C   - m1:m0 = (new) ul * v0, added in while a zeroed limb collects the carry
C The loads and stores between adds and adcs leave the carry flag alone.
C The accumulator names rotate from section to section: (w3,w0,w1),
C (w0,w1,w2), (w1,w2,w3), (w2,w3,w0).
C In L(lo2) the last ul * v0 product is left in m1:m0; L(top) or L(end)
C adds it.
L(top):
	adds	w3, w3, m0
	adcs	w0, w0, m1
	adc	w1, w1, #0
L(lo1):
	umull	m0, m1, ul, v1
	adds	rl, rl, w3
	ldr	ul, [up], #4
	str	rl, [rp, #0]
	ldr	rl, [rp, #4]!
	adcs	w0, w0, m0
	adc	w1, w1, m1
	umull	m0, m1, ul, v0
	mov	w2, #0
	adds	w0, w0, m0
	adcs	w1, w1, m1
	adc	w2, w2, #0
L(lo0):
	umull	m0, m1, ul, v1
	adds	rl, rl, w0
	ldr	ul, [up], #4
	str	rl, [rp, #0]
	ldr	rl, [rp, #4]!
	adcs	w1, w1, m0
	adc	w2, w2, m1
	umull	m0, m1, ul, v0
	mov	w3, #0
	adds	w1, w1, m0
	adcs	w2, w2, m1
	adc	w3, w3, #0
L(lo3):
	umull	m0, m1, ul, v1
	adds	rl, rl, w1
	ldr	ul, [up], #4
	str	rl, [rp, #0]
	ldr	rl, [rp, #4]!
	adcs	w2, w2, m0
	adc	w3, w3, m1
	umull	m0, m1, ul, v0
	mov	w0, #0
	adds	w2, w2, m0
	adcs	w3, w3, m1
	adc	w0, w0, #0
L(lo2):
	umull	m0, m1, ul, v1
	adds	rl, rl, w2
	ldr	ul, [up], #4
	str	rl, [rp, #0]
	ldr	rl, [rp, #4]!
	adcs	w3, w3, m0
	adc	w0, w0, m1
	umull	m0, m1, ul, v0
	subs	n, n, #4
	mov	w1, #0				C mov leaves the flags from subs intact
	bhi	L(top)				C loop while n is still above zero

C Last limb: add the pending ul * v0 product, then the ul * v1 product, store
C the final sum limb and the next accumulator limb, and return the top limb.
L(end):
	adds	w3, w3, m0
	adcs	w0, w0, m1
	adc	w1, w1, #0
	umull	m0, m1, ul, v1
	adds	rl, rl, w3
	str	rl, [rp], #4
	adcs	w0, w0, m0
	adc	w1, w1, m1
	str	w0, [rp, #0]
	mov	r0, w1

	pop	{ r4-r10, pc }
	.fnend
EPILOGUE(mpn_addmul_2)
-------------- next part --------------
dnl  ARM mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C StrongARM:	??
C XScale:	??
C Cortex-a8:	??

dnl Register assignments for mpn_mul_2.
dnl
dnl  rp, up, n: pointers and limb count used by the code below (r0-r2).
define(`rp',`r0')
define(`up',`r1')
define(`n',`r2')
dnl  vp and v0 deliberately share r3: the code loads v1 from [vp, #4] first
dnl  and only then overwrites the pointer with v0 from [vp, #0].
define(`vp',`r3')
define(`v0',`r3')
define(`v1',`r4')
dnl  ul: the current limb loaded from up.
define(`ul',`r5')
dnl  w0-w3: accumulator limbs; the unrolled loop rotates which three are live.
define(`w0',`r6')
define(`w1',`r7')
define(`w2',`r8')
define(`w3',`r9')
dnl  m0/m1: low and high halves of each umull result.
define(`m0',`ip')
define(`m1',`lr')

dnl Transcribed from the x86_64 version.
dnl Differences include:
dnl  - post-increment addressing instead of scaled indexed.
dnl    ul always holds "current" up limb, so no need to reload.
dnl  - m0/m1 used where x86 version uses eax/edx for multiply result,
dnl    except when we can accumulate in place with umlal.

ASM_START()
C mpn_mul_2
C
C Multiplies the n limbs at up by the two limbs at vp and stores the product
C limbs through rp, one per limb of up, post-incrementing rp after each
C store.  L(end) stores the last two limbs it writes (w3, then w0) and
C returns the top limb (w1) in r0.
C
C The loop is unrolled four ways.  n mod 4 picks the entry point, and each
C entry stub adjusts n to a multiple of 4 so that the "subs n, n, #4" at the
C bottom of the loop reaches zero when one limb is left for L(end).
C
C NOTE(review): with n = 1 the L(b1) path still runs a full pass of the loop
C (four limbs) before reaching L(end), so this code appears to rely on
C n >= 2 -- confirm against the callers.
PROLOGUE(mpn_mul_2)
	.fnstart
C r4-r9 hold v1, ul and w0-w3; lr is reused as m1 and ip as m0.  ip is
C saved and restored along with them.
C NOTE(review): presumably ip is included to keep the number of pushed
C registers even -- confirm.
	push	{ r4-r9, ip, lr }
	.save	{ r4-r9, ip, lr }

	ldr	v1, [vp, #4]			C load the 2-limb vector
	ldr	v0, [vp, #0]			C v0 shares r3 with vp, so it is loaded last
	ands	ip, n, #3			C ip = n mod 4, Z set when it is 0
	ldr	ul, [up], #4			C load first input limb

C The ldr above does not touch the flags set by ands.
	beq	L(b0)
	cmp	ip, #2
	beq	L(b2)
	bcc	L(b1)				C ip below 2 and not 0, so ip = 1
C Each stub forms the first ul * v0 product directly in the two accumulator
C limbs its entry point expects, zeroes the third one, and fixes up n.
L(b3):
	umull	w1, w2, ul, v0
	mov	w3, #0
	add	n, n, #1
	b	L(lo3)
L(b2):
	umull	w2, w3, ul, v0
	mov	w0, #0
	add	n, n, #2
	b	L(lo2)
L(b1):
	umull	w3, w0, ul, v0
	mov	w1, #0
	sub	n, n, #1
	b	L(lo1)
L(b0):
	umull	w0, w1, ul, v0
	mov	w2, #0
	b	L(lo0)

	ALIGN(16)
C Loop body.  Each of the four sections L(lo1), L(lo0), L(lo3), L(lo2)
C finishes one limb:
C   - umlal adds ul * v1 in place into the two upper accumulator limbs
C   - fetch the next up limb into ul
C   - m1:m0 = (new) ul * v0
C   - store the low accumulator limb through rp and advance rp
C   - zero the freed limb and add m1:m0, letting it collect the carry
C The accumulator names rotate from section to section: (w3,w0,w1),
C (w0,w1,w2), (w1,w2,w3), (w2,w3,w0).
C In L(lo2) the last ul * v0 product is left in m1:m0; L(top) or L(end)
C adds it.
L(top):
	adds	w3, w3, m0
	adcs	w0, w0, m1
	adc	w1, w1, #0
L(lo1):
	umlal	w0, w1, ul, v1
	ldr	ul, [up], #4
	umull	m0, m1, ul, v0
	str	w3, [rp], #4
	mov	w2, #0
	adds	w0, w0, m0
	adcs	w1, w1, m1
	adc	w2, w2, #0
L(lo0):
	umlal	w1, w2, ul, v1
	ldr	ul, [up], #4
	umull	m0, m1, ul, v0
	str	w0, [rp], #4
	mov	w3, #0
	adds	w1, w1, m0
	adcs	w2, w2, m1
	adc	w3, w3, #0
L(lo3):
	umlal	w2, w3, ul, v1
	ldr	ul, [up], #4
	umull	m0, m1, ul, v0
	str	w1, [rp], #4
	mov	w0, #0
	adds	w2, w2, m0
	adcs	w3, w3, m1
	adc	w0, w0, #0
L(lo2):
	umlal	w3, w0, ul, v1
	ldr	ul, [up], #4
	umull	m0, m1, ul, v0
	str	w2, [rp], #4
	subs	n, n, #4
	mov	w1, #0				C mov leaves the flags from subs intact
	bhi	L(top)				C loop while n is still above zero

C Last limb: add the pending ul * v0 product, accumulate ul * v1 on top of
C it, store the two remaining low limbs, and return the top limb.
L(end):
	adds	w3, w3, m0
	adcs	w0, w0, m1
	adc	w1, w1, #0
	umlal	w0, w1, ul, v1
	str	w3, [rp, #0]
	str	w0, [rp, #4]
	mov	r0, w1

	pop	{ r4-r9, ip, pc }
	.fnend
EPILOGUE(mpn_mul_2)