Some arm cortex-a8 improvements
Richard Henderson
rth at twiddle.net
Mon Apr 23 16:15:09 CEST 2012
On 04/22/2012 03:06 PM, Torbjorn Granlund wrote:
> Richard Henderson <rth at twiddle.net> writes:
>
> I used the following, almost certainly not appropriate for general application.
>
> [snip]
>
> Thanks. It would be very useful to make GMP timing work with the kernel
> Linux running on ARM. Do you know if there are similar problems with,
> say, NetBSD?
I have no idea.
> The new code is carefully software pipelined, and mul_1 and addmul_1 run
> faster than both the old code and your patched code, at least on A9.
> Could you please try it on A8 and see if it is at least as fast as your
> code there?
I'll throw it in the hopper some time soon.
> If you have ARMv4 (e.g., StrongARM) and/or ARMv5 (e.g., XScale) I would
> appreciate if you could check if they still work after the latest
> changes.
I do not. The gcc*.fsffrance.org farm says there are some extant, but
they are turned off at the moment. Inside Red Hat I believe we only have
armv7+ available.
> Do you know if there is a portable mechanism for recognising an ARM
> core, akin to x86's cpuid?
Indeed I know that the hw registers that allow such recognition
are all privileged. For Linux the best one can do is /proc/cpuinfo
or (to some extent) the values in AT_HWCAP.
FYI, I dug out the add/mul_2.asm files I was working on in February.
IIRC, they're correct as in they pass the testsuite, but I could not
show them to be faster than the add/mul_1 paths.
r~
-------------- next part --------------
dnl ARM mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl add the result to a third limb vector.
dnl Copyright 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C StrongARM: ??
C XScale: ??
C Cortex-a8: ??
C Register aliases used by the routine below.
C rp, up, n and vp name r0-r3; the code reads them on entry without
C initialising them.
define(`rp',`r0')
define(`up',`r1')
define(`n',`r2')
C vp and v0 are both r3: loading v0 overwrites the vp pointer, so the
C other limb of the 2-limb vector has to be fetched through vp first.
define(`vp',`r3')
define(`v0',`r3')
define(`v1',`r4')
C ul holds the limb most recently loaded through up.
define(`ul',`r5')
C w0-w3 are the accumulator words; their roles rotate by one position in
C each quarter of the unrolled loop.
define(`w0',`r6')
define(`w1',`r7')
define(`w2',`r8')
define(`w3',`r9')
C rl holds the limb most recently loaded through rp.
define(`rl',`r10')
C m0/m1 receive the low/high word of each umull result.  ip is also used
C once as a scratch register for n mod 4 before the first multiply.
define(`m0',`ip')
define(`m1',`lr')
dnl Transcribed from the x86_64 version.
dnl Differences include:
dnl - post-increment addressing instead of scaled indexed.
dnl ul always holds "current" up limb, so no need to reload.
dnl - m0/m1 used where x86 version uses eax/edx for multiply result.
dnl - loads from rp occur asap in prep for what is the rmw add on x86.
C mpn_addmul_2
C In:  r0 = rp, r1 = up, r2 = n, r3 = vp (see the register defines).
C Out: r0 = most significant limb of the result.
C Multiplies the n limbs read through up by the two limbs at vp[0] and
C vp[1], adds the product into the n limbs read through rp, stores one
C further limb just above them, and returns the top limb in r0.
C The loop is unrolled four ways; n mod 4 selects the entry point.
C NOTE(review): with n = 1 the L(b1) stub sets n to 0 and enters L(lo1),
C and the loop-bottom test is only reached after a full four-limb pass,
C so the code appears to rely on n >= 2.  Confirm against callers.
ASM_START()
PROLOGUE(mpn_addmul_2)
.fnstart
C Save every callee-saved register used below, plus the return address.
push { r4-r10, lr }
C Unwind annotation describing the push above.
.save { r4-r10, lr }
ldr ul, [up], #4 C load first input limb
C v1 is fetched before v0 because loading v0 overwrites vp (both are r3).
ldr v1, [vp, #4] C load the 2-limb vector
ldr v0, [vp, #0]
C ip = n mod 4.  The flags set here are still live at the beq below,
C since the intervening ldr does not change them.
ands ip, n, #3
ldr rl, [rp, #0] C load the first addend limb
beq L(b0)
cmp ip, #2
beq L(b2)
bcc L(b1)
C Entry stubs.  Each one forms the first ul * v0 product in the
C accumulator pair its loop entry expects and clears the next accumulator
C word.  L(b3), L(b2) and L(b1) also adjust n so that the subs at the
C loop bottom reaches zero on the final pass.
L(b3):
umull w1, w2, ul, v0
mov w3, #0
add n, n, #1
b L(lo3)
L(b2):
umull w2, w3, ul, v0
mov w0, #0
add n, n, #2
b L(lo2)
L(b1):
umull w3, w0, ul, v0
mov w1, #0
sub n, n, #1
b L(lo1)
L(b0):
umull w0, w1, ul, v0
mov w2, #0
b L(lo0)
ALIGN(16)
L(top):
C Fold in the ul * v0 product issued just before the loop branch.
adds w3, w3, m0
adcs w0, w0, m1
adc w1, w1, #0
C Each quarter below has the same shape with the w registers rotated:
C   start ul * v1; add the lowest accumulator word into the current rp
C   limb and store it; fetch the next up limb and the next rp limb;
C   add the v1 product into the next two accumulator words; then form
C   the new ul * v0 product and add it in, catching the carry in a
C   freshly cleared third word.
C The ldr and str instructions leave the carry flag untouched, so the
C adds/adcs chain continues across them.
L(lo1):
umull m0, m1, ul, v1
adds rl, rl, w3
ldr ul, [up], #4
str rl, [rp, #0]
ldr rl, [rp, #4]!
adcs w0, w0, m0
adc w1, w1, m1
umull m0, m1, ul, v0
mov w2, #0
adds w0, w0, m0
adcs w1, w1, m1
adc w2, w2, #0
L(lo0):
umull m0, m1, ul, v1
adds rl, rl, w0
ldr ul, [up], #4
str rl, [rp, #0]
ldr rl, [rp, #4]!
adcs w1, w1, m0
adc w2, w2, m1
umull m0, m1, ul, v0
mov w3, #0
adds w1, w1, m0
adcs w2, w2, m1
adc w3, w3, #0
L(lo3):
umull m0, m1, ul, v1
adds rl, rl, w1
ldr ul, [up], #4
str rl, [rp, #0]
ldr rl, [rp, #4]!
adcs w2, w2, m0
adc w3, w3, m1
umull m0, m1, ul, v0
mov w0, #0
adds w2, w2, m0
adcs w3, w3, m1
adc w0, w0, #0
L(lo2):
umull m0, m1, ul, v1
adds rl, rl, w2
ldr ul, [up], #4
str rl, [rp, #0]
ldr rl, [rp, #4]!
adcs w3, w3, m0
adc w0, w0, m1
C The last ul * v0 product of the pass is left in m0/m1; it is added
C either at L(top) or at L(end).
umull m0, m1, ul, v0
C Four limbs are consumed per pass; loop while the count stays above 0.
subs n, n, #4
mov w1, #0
bhi L(top)
L(end):
C Wind-down: finish the last up limb.  No further limbs are loaded.
adds w3, w3, m0
adcs w0, w0, m1
adc w1, w1, #0
umull m0, m1, ul, v1
adds rl, rl, w3
str rl, [rp], #4
adcs w0, w0, m0
adc w1, w1, m1
C This limb is written without reading the old value at that address.
str w0, [rp, #0]
C Return the most significant limb.
mov r0, w1
C Restore the saved registers and return by popping into pc.
pop { r4-r10, pc }
.fnend
EPILOGUE(mpn_addmul_2)
-------------- next part --------------
dnl ARM mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl store the result in a third limb vector.
dnl Copyright 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C StrongARM: ??
C XScale: ??
C Cortex-a8: ??
C Register aliases used by the routine below.
C rp, up, n and vp name r0-r3; the code reads them on entry without
C initialising them.
define(`rp',`r0')
define(`up',`r1')
define(`n',`r2')
C vp and v0 are both r3: loading v0 overwrites the vp pointer, so the
C other limb of the 2-limb vector has to be fetched through vp first.
define(`vp',`r3')
define(`v0',`r3')
define(`v1',`r4')
C ul holds the limb most recently loaded through up.
define(`ul',`r5')
C w0-w3 are the accumulator words; their roles rotate by one position in
C each quarter of the unrolled loop.
define(`w0',`r6')
define(`w1',`r7')
define(`w2',`r8')
define(`w3',`r9')
C m0/m1 receive the low/high word of each umull result.  ip is also used
C once as a scratch register for n mod 4 before the first multiply.
define(`m0',`ip')
define(`m1',`lr')
dnl Transcribed from the x86_64 version.
dnl Differences include:
dnl - post-increment addressing instead of scaled indexed.
dnl ul always holds "current" up limb, so no need to reload.
dnl - m0/m1 used where x86 version uses eax/edx for multiply result,
dnl except when we can accumulate in place with umlal.
ASM_START()
C mpn_mul_2
C In:  r0 = rp, r1 = up, r2 = n, r3 = vp (see the register defines).
C Out: r0 = most significant limb of the result.
C Multiplies the n limbs read through up by the two limbs at vp[0] and
C vp[1], stores the low n+1 limbs of the product through rp, and returns
C the top limb in r0.  rp is only written, never read.
C The loop is unrolled four ways; n mod 4 selects the entry point.
C NOTE(review): with n = 1 the L(b1) stub sets n to 0 and enters L(lo1),
C and the loop-bottom test is only reached after a full four-limb pass,
C so the code appears to rely on n >= 2.  Confirm against callers.
PROLOGUE(mpn_mul_2)
.fnstart
C Save the callee-saved registers used below, plus the return address.
C ip is in the list as well, which makes it eight registers; presumably
C this keeps sp 8-byte aligned -- TODO confirm.
push { r4-r9, ip, lr }
C Unwind annotation describing the push above.
.save { r4-r9, ip, lr }
C v1 is fetched before v0 because loading v0 overwrites vp (both are r3).
ldr v1, [vp, #4] C load the 2-limb vector
ldr v0, [vp, #0]
C ip = n mod 4.  The flags set here are still live at the beq below,
C since the intervening ldr does not change them.
ands ip, n, #3
ldr ul, [up], #4 C load first input limb
beq L(b0)
cmp ip, #2
beq L(b2)
bcc L(b1)
C Entry stubs.  Each one forms the first ul * v0 product in the
C accumulator pair its loop entry expects and clears the next accumulator
C word.  L(b3), L(b2) and L(b1) also adjust n so that the subs at the
C loop bottom reaches zero on the final pass.
L(b3):
umull w1, w2, ul, v0
mov w3, #0
add n, n, #1
b L(lo3)
L(b2):
umull w2, w3, ul, v0
mov w0, #0
add n, n, #2
b L(lo2)
L(b1):
umull w3, w0, ul, v0
mov w1, #0
sub n, n, #1
b L(lo1)
L(b0):
umull w0, w1, ul, v0
mov w2, #0
b L(lo0)
ALIGN(16)
L(top):
C Fold in the ul * v0 product issued just before the loop branch.
adds w3, w3, m0
adcs w0, w0, m1
adc w1, w1, #0
C Each quarter below has the same shape with the w registers rotated:
C   umlal adds ul * v1 straight into the upper two accumulator words;
C   the next up limb is loaded and multiplied by v0 into m0/m1;
C   the finished lowest accumulator word is stored through rp;
C   the v0 product is then added in, with the carry caught in a freshly
C   cleared third word.
L(lo1):
umlal w0, w1, ul, v1
ldr ul, [up], #4
umull m0, m1, ul, v0
str w3, [rp], #4
mov w2, #0
adds w0, w0, m0
adcs w1, w1, m1
adc w2, w2, #0
L(lo0):
umlal w1, w2, ul, v1
ldr ul, [up], #4
umull m0, m1, ul, v0
str w0, [rp], #4
mov w3, #0
adds w1, w1, m0
adcs w2, w2, m1
adc w3, w3, #0
L(lo3):
umlal w2, w3, ul, v1
ldr ul, [up], #4
umull m0, m1, ul, v0
str w1, [rp], #4
mov w0, #0
adds w2, w2, m0
adcs w3, w3, m1
adc w0, w0, #0
L(lo2):
umlal w3, w0, ul, v1
ldr ul, [up], #4
C The last ul * v0 product of the pass is left in m0/m1; it is added
C either at L(top) or at L(end).
umull m0, m1, ul, v0
str w2, [rp], #4
C Four limbs are consumed per pass; loop while the count stays above 0.
subs n, n, #4
mov w1, #0
bhi L(top)
L(end):
C Wind-down: finish the last up limb.  No further limbs are loaded.
adds w3, w3, m0
adcs w0, w0, m1
adc w1, w1, #0
umlal w0, w1, ul, v1
C Store the last two result limbs kept in memory.
str w3, [rp, #0]
str w0, [rp, #4]
C Return the most significant limb.
mov r0, w1
C Restore the saved registers and return by popping into pc.
pop { r4-r9, ip, pc }
.fnend
EPILOGUE(mpn_mul_2)
More information about the gmp-devel
mailing list