neon logops
Richard Henderson
rth at twiddle.net
Fri Mar 8 09:12:31 CET 2013
Building on the copyi that tege committed the other day, this patch uses NEON
for the logical operations too.
I did both a 128-bit aligned version,
> $ ./speed-128 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
> clock_gettime is 1.000ns accurate
> overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
> mpn_and_n mpn_nand_n
> 10 #1.7987 1.8986
> 50 #0.9393 1.0692
> 100 #1.2491 1.3890
> 500 #0.8154 0.9753
> 1000 #0.7786 0.9435
> 5000 #1.4955 1.5765
> 10000 #1.6532 1.7415
and a 256-bit aligned version, just to see if having a higher ratio of
operation insns to memory insns would help,
> $ ./speed-256 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
> clock_gettime is 1.000ns accurate
> overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
> mpn_and_n mpn_nand_n
> 10 #1.5989 1.6988
> 50 #1.0992 1.1592
> 100 #1.0393 1.0593
> 500 #1.0373 1.0413
> 1000 #1.0303 1.0313
> 5000 #1.5914 1.6003
> 10000 1.6824 #1.6768
It's a bit curious how the latter is less "jaggy", but slightly slower.
r~
-------------- next part --------------
dnl ARM mpn_and_n, et al.
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C and cyc/l nand cyc/l
C StrongARM ? ?
C XScale ? ?
C Cortex-A8 ? ?
C Cortex-A9 ? ?
C Cortex-A15 0.78 0.94
C Symbolic names for the four registers the routine works on: destination
C pointer, the two source pointers, and the limb count.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n', `r3')
C POSTOP is invoked on every LOGOP result.  By default it expands to nothing;
C the nand/nior/xnor variants below redefine it to complement the result with
C vmvn.  Only its first argument is used.
define(`POSTOP')
C Exactly one OPERATION_xxx symbol is expected to be defined by the build;
C it selects the entry point name (func) and the NEON instruction(s) used.
C LOGOP(dst, a, b) is always called with a = data from up, b = data from vp.
C and_n: up AND vp
ifdef(`OPERATION_and_n',`
define(`func', `mpn_and_n')
define(`LOGOP', `vand $1, $2, $3')')
C andn_n: up AND NOT vp (vbic clears the bits that are set in vp)
ifdef(`OPERATION_andn_n',`
define(`func', `mpn_andn_n')
define(`LOGOP', `vbic $1, $2, $3')')
C nand_n: NOT (up AND vp)
ifdef(`OPERATION_nand_n',`
define(`func', `mpn_nand_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `vand $1, $2, $3')')
C ior_n: up OR vp
ifdef(`OPERATION_ior_n',`
define(`func', `mpn_ior_n')
define(`LOGOP', `vorr $1, $2, $3')')
C iorn_n: up OR NOT vp
ifdef(`OPERATION_iorn_n',`
define(`func', `mpn_iorn_n')
define(`LOGOP', `vorn $1, $2, $3')')
C nior_n: NOT (up OR vp)
ifdef(`OPERATION_nior_n',`
define(`func', `mpn_nior_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `vorr $1, $2, $3')')
C xor_n: up XOR vp
ifdef(`OPERATION_xor_n',`
define(`func', `mpn_xor_n')
define(`LOGOP', `veor $1, $2, $3')')
C xnor_n: NOT (up XOR vp)
ifdef(`OPERATION_xnor_n',`
define(`func', `mpn_xnor_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `veor $1, $2, $3')')
C Names of all entry points this source can be assembled as; func is chosen
C by whichever OPERATION_xxx symbol is defined.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
ASM_START()
.fpu neon
C func: rp[i] = POSTOP(LOGOP(up[i], vp[i])) for i = 0 .. n-1
C
C In:    rp (r0) destination, up (r1) and vp (r2) sources, n (r3) number of
C        32-bit limbs.  With n = 0 every tail test fails and nothing is stored.
C Uses:  q0-q3 (d0-d7) and the condition flags; rp, up, vp, n are modified.
C Assumes rp is at least 4-byte aligned, since the alignment steps only test
C bits 2 and 3 of rp before the :64/:128 store hints are used -- TODO confirm
C against the callers.
PROLOGUE(func)
C Fewer than 8 limbs: skip the alignment steps and the pipelined loop.
cmp n, #7
ble L(bc)
C Process 1 and/or 2 limbs until rp is 128-bit aligned
tst rp, #4
beq L(al1)
vld1.32 {d0[0]}, [up]!
vld1.32 {d1[0]}, [vp]!
sub n, n, #1
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0[0]}, [rp]!
L(al1): tst rp, #8
beq L(al2)
vld1.32 {d0}, [up]!
vld1.32 {d1}, [vp]!
sub n, n, #2
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0}, [rp:64]!
C Prime the software pipeline: q2/q3 receive the next 4 limbs of up/vp.  At
C least 5 limbs remain here (n >= 8 on entry, at most 3 consumed above).
C n is biased by 12 = 4 (just loaded) + 8 (one loop iteration), so n >= 0
C means the loop may load 8 more limbs.
L(al2): vld1.32 {q2}, [up]!
vld1.32 {q3}, [vp]!
subs n, n, #12
blt L(end)
ALIGN(16)
C Main loop, 8 limbs per iteration.  Each half loads the next 4 limbs into one
C register pair while combining and storing the pair loaded previously.
L(top): vld1.32 {q0}, [up]!
LOGOP( q2, q2, q3)
vld1.32 {q1}, [vp]!
POSTOP( q2, q2)
subs n, n, #8
vst1.32 {q2}, [rp:128]!
vld1.32 {q2}, [up]!
LOGOP( q0, q0, q1)
vld1.32 {q3}, [vp]!
POSTOP( q0, q0)
vst1.32 {q0}, [rp:128]!
bge L(top)
C Drain the pipeline: q2/q3 were loaded but not yet combined.
L(end): LOGOP( q2, q2, q3)
POSTOP( q2, q2)
vst1.32 {q2}, [rp:128]!
C Process last 0-7 limbs.  When falling out of the loop n equals the number
C of limbs left minus 8, so its low three bits are the number of limbs left.
C Note that rp is aligned after loop, but not when we arrive here via L(bc),
C hence no alignment hints on the stores below.
L(bc): tst n, #4
beq L(tl1)
vld1.32 {q0}, [up]!
vld1.32 {q1}, [vp]!
LOGOP( q0, q0, q1)
POSTOP( q0, q0)
vst1.32 {q0}, [rp]!
L(tl1): tst n, #2
beq L(tl2)
vld1.32 {d0}, [up]!
vld1.32 {d1}, [vp]!
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0}, [rp]!
L(tl2): tst n, #1
beq L(tl3)
vld1.32 {d0[0]}, [up]!
vld1.32 {d1[0]}, [vp]!
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0[0]}, [rp]!
L(tl3): bx lr
EPILOGUE()
-------------- next part --------------
dnl ARM mpn_and_n, et al.
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C and cyc/l nand cyc/l
C StrongARM ? ?
C XScale ? ?
C Cortex-A8 ? ?
C Cortex-A9 ? ?
C Cortex-A15 1 1
C Symbolic names for the four registers the routine works on: destination
C pointer, the two source pointers, and the limb count.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n', `r3')
C POSTOP is invoked on every LOGOP result.  By default it expands to nothing;
C the nand/nior/xnor variants below redefine it to complement the result with
C vmvn.  Only its first argument is used.
define(`POSTOP')
C Exactly one OPERATION_xxx symbol is expected to be defined by the build;
C it selects the entry point name (func) and the NEON instruction(s) used.
C LOGOP(dst, a, b) is always called with a = data from up, b = data from vp.
C and_n: up AND vp
ifdef(`OPERATION_and_n',`
define(`func', `mpn_and_n')
define(`LOGOP', `vand $1, $2, $3')')
C andn_n: up AND NOT vp (vbic clears the bits that are set in vp)
ifdef(`OPERATION_andn_n',`
define(`func', `mpn_andn_n')
define(`LOGOP', `vbic $1, $2, $3')')
C nand_n: NOT (up AND vp)
ifdef(`OPERATION_nand_n',`
define(`func', `mpn_nand_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `vand $1, $2, $3')')
C ior_n: up OR vp
ifdef(`OPERATION_ior_n',`
define(`func', `mpn_ior_n')
define(`LOGOP', `vorr $1, $2, $3')')
C iorn_n: up OR NOT vp
ifdef(`OPERATION_iorn_n',`
define(`func', `mpn_iorn_n')
define(`LOGOP', `vorn $1, $2, $3')')
C nior_n: NOT (up OR vp)
ifdef(`OPERATION_nior_n',`
define(`func', `mpn_nior_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `vorr $1, $2, $3')')
C xor_n: up XOR vp
ifdef(`OPERATION_xor_n',`
define(`func', `mpn_xor_n')
define(`LOGOP', `veor $1, $2, $3')')
C xnor_n: NOT (up XOR vp)
ifdef(`OPERATION_xnor_n',`
define(`func', `mpn_xnor_n')
define(`POSTOP', `vmvn $1, $1')
define(`LOGOP', `veor $1, $2, $3')')
C Names of all entry points this source can be assembled as; func is chosen
C by whichever OPERATION_xxx symbol is defined.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
ASM_START()
.fpu neon
C func: rp[i] = POSTOP(LOGOP(up[i], vp[i])) for i = 0 .. n-1
C
C In:    rp (r0) destination, up (r1) and vp (r2) sources, n (r3) number of
C        32-bit limbs.  With n = 0 every tail test fails and nothing is stored.
C Uses:  q0-q3 and q8-q11 plus the condition flags; rp, up, vp, n are
C        modified.  q4-q7 are deliberately left untouched.
C Assumes rp is at least 4-byte aligned, since the alignment steps only test
C bits 2, 3 and 4 of rp before the :64/:128/:256 store hints are used -- TODO
C confirm against the callers.
PROLOGUE(func)
C Fewer than 16 limbs: skip the alignment steps and the pipelined loop.
cmp n, #15
ble L(bc)
C Process 1, 2 and/or 4 limbs until rp is 256-bit aligned
tst rp, #4
beq L(al1)
vld1.32 {d0[0]}, [up]!
vld1.32 {d1[0]}, [vp]!
sub n, n, #1
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0[0]}, [rp]!
L(al1): tst rp, #8
beq L(al2)
vld1.32 {d0}, [up]!
vld1.32 {d1}, [vp]!
sub n, n, #2
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0}, [rp:64]!
L(al2): tst rp, #16
beq L(al3)
vld1.32 {q0}, [up]!
vld1.32 {q1}, [vp]!
C Plain sub, like the 1- and 2-limb steps: the flags are not needed here,
C they are set by the subs at L(al3) before the next conditional branch.
sub n, n, #4
LOGOP( q0, q0, q1)
POSTOP( q0, q0)
vst1.32 {q0}, [rp:128]!
C Prime the software pipeline: q8-q9/q10-q11 receive the next 8 limbs of
C up/vp.  At least 9 limbs remain here (n >= 16 on entry, at most 7 consumed
C above).  n is biased by 8 (just loaded) + 16 (one loop iteration), so
C n >= 0 means the loop may load 16 more limbs.
L(al3): vld1.32 {q8-q9}, [up]!
vld1.32 {q10-q11}, [vp]!
subs n, n, #8+16
blt L(end)
ALIGN(16)
C Main loop, 16 limbs per iteration.  Each half loads the next 8 limbs into
C one register group while combining and storing the group loaded previously.
L(top): vld1.32 {q0-q1}, [up]!
LOGOP( q8, q8, q10)
vld1.32 {q2-q3}, [vp]!
LOGOP( q9, q9, q11)
POSTOP( q8, q8)
POSTOP( q9, q9)
vst1.32 {q8-q9}, [rp:256]!
subs n, n, #16
vld1.32 {q8-q9}, [up]!
LOGOP( q0, q0, q2)
vld1.32 {q10-q11}, [vp]!
LOGOP( q1, q1, q3)
POSTOP( q0, q0)
POSTOP( q1, q1)
vst1.32 {q0-q1}, [rp:256]!
bge L(top)
C Drain the pipeline: q8-q11 were loaded but not yet combined.
L(end): LOGOP( q8, q8, q10)
LOGOP( q9, q9, q11)
POSTOP( q8, q8)
POSTOP( q9, q9)
vst1.32 {q8-q9}, [rp:256]!
C Process last 0-15 limbs.  When falling out of the loop n equals the number
C of limbs left minus 16, so its low four bits are the number of limbs left.
C Note that rp is aligned after loop, but not when we arrive here via L(bc),
C hence no alignment hints on the stores below.
L(bc): tst n, #8
beq L(tl0)
vld1.32 {q0-q1}, [up]!
vld1.32 {q2-q3}, [vp]!
LOGOP( q0, q0, q2)
LOGOP( q1, q1, q3)
POSTOP( q0, q0)
POSTOP( q1, q1)
vst1.32 {q0-q1}, [rp]!
L(tl0): tst n, #4
beq L(tl1)
vld1.32 {q0}, [up]!
vld1.32 {q1}, [vp]!
LOGOP( q0, q0, q1)
POSTOP( q0, q0)
vst1.32 {q0}, [rp]!
L(tl1): tst n, #2
beq L(tl2)
vld1.32 {d0}, [up]!
vld1.32 {d1}, [vp]!
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0}, [rp]!
L(tl2): tst n, #1
beq L(tl3)
vld1.32 {d0[0]}, [up]!
vld1.32 {d1[0]}, [vp]!
LOGOP( d0, d0, d1)
POSTOP( d0, d0)
vst1.32 {d0[0]}, [rp]!
L(tl3): bx lr
EPILOGUE()
More information about the gmp-devel
mailing list