neon logops

Richard Henderson rth at twiddle.net
Fri Mar 8 09:12:31 CET 2013


Building on the copyi that tege committed the other day, use neon for the 
logical operations too.

I did both a 128-bit aligned version,

> $ ./speed-128 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
> clock_gettime is 1.000ns accurate
> overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
>             mpn_and_n    mpn_nand_n
> 10            #1.7987        1.8986
> 50            #0.9393        1.0692
> 100           #1.2491        1.3890
> 500           #0.8154        0.9753
> 1000          #0.7786        0.9435
> 5000          #1.4955        1.5765
> 10000         #1.6532        1.7415

and a 256-bit aligned version, just to see if having a higher ratio of 
operation insns to memory insns would help,

> $ ./speed-256 -p 1000000000 -C -s 10,50,100,500,1000,5000,10000 mpn_and_n mpn_nand_n
> clock_gettime is 1.000ns accurate
> overhead 6.00 cycles, precision 1000000000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
>             mpn_and_n    mpn_nand_n
> 10            #1.5989        1.6988
> 50            #1.0992        1.1592
> 100           #1.0393        1.0593
> 500           #1.0373        1.0413
> 1000          #1.0303        1.0313
> 5000          #1.5914        1.6003
> 10000          1.6824       #1.6768

It's a bit curious how the latter is less "jaggy", but slightly slower.


r~
-------------- next part --------------
dnl  ARM mpn_and_n, et al.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     and cyc/l		nand cyc/l
C StrongARM	 ?		    ?
C XScale	 ?		    ?
C Cortex-A8	 ?		    ?
C Cortex-A9	 ?		    ?
C Cortex-A15	 0.78		    0.94

C Argument registers.  In the code of this file rp is only stored through,
C up and vp are only loaded through, and n is the limb count.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n',  `r3')

C POSTOP is an optional extra instruction applied to a result register.  It
C is empty by default; the nand_n, nior_n and xnor_n cases below redefine it
C as vmvn, which complements the register in place.  The call sites pass two
C arguments but only the first one is used by the definition.
define(`POSTOP')

C Select func and LOGOP from whichever OPERATION_xxx symbol is defined.
C NOTE(review): presumably exactly one of them is defined per build of this
C file by the GMP build machinery; that is not visible here, confirm.
C LOGOP(dst, a, b) expands to a single three-operand NEON instruction:
C   vand   dst = a AND b        vbic   dst = a AND NOT b
C   vorr   dst = a OR b         vorn   dst = a OR NOT b
C   veor   dst = a XOR b
ifdef(`OPERATION_and_n',`
  define(`func',    `mpn_and_n')
  define(`LOGOP',   `vand	$1, $2, $3')')
ifdef(`OPERATION_andn_n',`
  define(`func',    `mpn_andn_n')
  define(`LOGOP',   `vbic	$1, $2, $3')')
ifdef(`OPERATION_nand_n',`
  define(`func',    `mpn_nand_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `vand	$1, $2, $3')')
ifdef(`OPERATION_ior_n',`
  define(`func',    `mpn_ior_n')
  define(`LOGOP',   `vorr	$1, $2, $3')')
ifdef(`OPERATION_iorn_n',`
  define(`func',    `mpn_iorn_n')
  define(`LOGOP',   `vorn	$1, $2, $3')')
ifdef(`OPERATION_nior_n',`
  define(`func',    `mpn_nior_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `vorr	$1, $2, $3')')
ifdef(`OPERATION_xor_n',`
  define(`func',    `mpn_xor_n')
  define(`LOGOP',   `veor	$1, $2, $3')')
ifdef(`OPERATION_xnor_n',`
  define(`func',    `mpn_xnor_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `veor	$1, $2, $3')')

C The eight function names this one source file can be built as.
C NOTE(review): presumably this list is scanned by the GMP configure
C machinery; confirm against the build scripts.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
C The whole routine is written with NEON instructions.
	.fpu	neon
C func(rp, up, vp, n)
C
C Apply LOGOP, followed by POSTOP when it is non-empty, limb by limb to the n
C limbs at up and vp and store the n result limbs at rp.  Limbs are treated
C as 32-bit elements (vld1.32/vst1.32; one d lane = 1 limb, one d register =
C 2 limbs, one q register = 4 limbs).
C
C Structure:
C   n <= 7   go straight to the tail code at L(bc).
C   n >= 8   process 0-3 limbs so that rp becomes 128-bit aligned, run a
C            software-pipelined loop that handles 8 limbs per iteration using
C            :128 hinted stores, drain the pipeline, then run the tail code.
C Only stores carry alignment hints; the loads from up and vp never do.
C
C Registers written: r0-r3, q0-q3 (d0-d7) and the condition flags.
PROLOGUE(func)
	cmp		n, #7
	ble		L(bc)

C Process leading limbs until rp is 128-bit aligned.  Only bits 2 and 3 of rp
C are tested, so this assumes rp is at least 4-byte aligned on entry.
C Bit 2 set: handle a single limb through lane 0 of d0/d1.  The other lane is
C operated on as well but only lane 0 is stored.
	tst		rp, #4
	beq		L(al1)
	vld1.32		{d0[0]}, [up]!
	vld1.32		{d1[0]}, [vp]!
	sub		n, n, #1
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0[0]}, [rp]!
C Bit 3 set: handle two limbs.  Bit 2 of rp is clear here, so the store may
C use the :64 hint.
L(al1):	tst		rp, #8
	beq		L(al2)
	vld1.32		{d0}, [up]!
	vld1.32		{d1}, [vp]!
	sub		n, n, #2
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0}, [rp:64]!
C Prime the pipeline with 4 limbs of each operand in q2/q3.  At least 5 limbs
C remain here (n was >= 8 and at most 3 were consumed above), so this load is
C in bounds.  n is then biased by -12: 4 for the limbs just loaded plus 8 for
C the further limbs one loop iteration loads.  Hence n >= 0 means a full
C iteration can run; otherwise only the drain at L(end) is done.
L(al2):	vld1.32		{q2}, [up]!
	vld1.32		{q3}, [vp]!
	subs		n, n, #12
	blt		L(end)

C Main loop, 8 limbs per iteration.  On entry q2/q3 hold 4 limbs that are
C loaded but not yet processed.  Each iteration loads 4 limbs into q0/q1,
C finishes and stores q2, reloads q2/q3 with the next 4 limbs, then finishes
C and stores q0.  The loads are placed ahead of the operations that use them,
C presumably to hide load latency.  The bge at the bottom consumes the flags
C from the subs; the NEON instructions in between do not alter them.
	ALIGN(16)
L(top):	vld1.32		{q0}, [up]!
	LOGOP(		q2, q2, q3)
	vld1.32		{q1}, [vp]!
	POSTOP(		q2, q2)
	subs		n, n, #8
	vst1.32		{q2}, [rp:128]!
	vld1.32		{q2}, [up]!
	LOGOP(		q0, q0, q1)
	vld1.32		{q3}, [vp]!
	POSTOP(		q0, q0)
	vst1.32		{q0}, [rp:128]!
	bge	L(top)

C Drain the pipeline: q2/q3 still hold 4 loaded but unprocessed limbs.
L(end):	LOGOP(		q2, q2, q3)
	POSTOP(		q2, q2)
	vst1.32		{q2}, [rp:128]!

C Process the last 0-7 limbs, in chunks of 4, 2 and 1 selected by the low
C three bits of n.  When arriving from the loop n is in the range -8..-1,
C i.e. the number of limbs left minus 8, so its low three bits still equal
C the number of limbs left.  Note that rp is aligned after the loop, but not
C when we arrive here via L(bc), so these stores carry no alignment hint.
L(bc):	tst		n, #4
	beq		L(tl1)
	vld1.32		{q0}, [up]!
	vld1.32		{q1}, [vp]!
	LOGOP(		q0, q0, q1)
	POSTOP(		q0, q0)
	vst1.32		{q0}, [rp]!
L(tl1):	tst		n, #2
	beq		L(tl2)
	vld1.32		{d0}, [up]!
	vld1.32		{d1}, [vp]!
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0}, [rp]!
L(tl2):	tst		n, #1
	beq		L(tl3)
	vld1.32		{d0[0]}, [up]!
	vld1.32		{d1[0]}, [vp]!
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0[0]}, [rp]!
C Return to the caller; no value is set up in r0.
L(tl3):	bx		lr
EPILOGUE()
-------------- next part --------------
dnl  ARM mpn_and_n, et al.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     and cyc/l		nand cyc/l
C StrongARM	 ?		    ?
C XScale	 ?		    ?
C Cortex-A8	 ?		    ?
C Cortex-A9	 ?		    ?
C Cortex-A15	 1		    1

C Argument registers.  In the code of this file rp is only stored through,
C up and vp are only loaded through, and n is the limb count.
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n',  `r3')

C POSTOP is an optional extra instruction applied to a result register.  It
C is empty by default; the nand_n, nior_n and xnor_n cases below redefine it
C as vmvn, which complements the register in place.  The call sites pass two
C arguments but only the first one is used by the definition.
define(`POSTOP')

C Select func and LOGOP from whichever OPERATION_xxx symbol is defined.
C NOTE(review): presumably exactly one of them is defined per build of this
C file by the GMP build machinery; that is not visible here, confirm.
C LOGOP(dst, a, b) expands to a single three-operand NEON instruction:
C   vand   dst = a AND b        vbic   dst = a AND NOT b
C   vorr   dst = a OR b         vorn   dst = a OR NOT b
C   veor   dst = a XOR b
ifdef(`OPERATION_and_n',`
  define(`func',    `mpn_and_n')
  define(`LOGOP',   `vand	$1, $2, $3')')
ifdef(`OPERATION_andn_n',`
  define(`func',    `mpn_andn_n')
  define(`LOGOP',   `vbic	$1, $2, $3')')
ifdef(`OPERATION_nand_n',`
  define(`func',    `mpn_nand_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `vand	$1, $2, $3')')
ifdef(`OPERATION_ior_n',`
  define(`func',    `mpn_ior_n')
  define(`LOGOP',   `vorr	$1, $2, $3')')
ifdef(`OPERATION_iorn_n',`
  define(`func',    `mpn_iorn_n')
  define(`LOGOP',   `vorn	$1, $2, $3')')
ifdef(`OPERATION_nior_n',`
  define(`func',    `mpn_nior_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `vorr	$1, $2, $3')')
ifdef(`OPERATION_xor_n',`
  define(`func',    `mpn_xor_n')
  define(`LOGOP',   `veor	$1, $2, $3')')
ifdef(`OPERATION_xnor_n',`
  define(`func',    `mpn_xnor_n')
  define(`POSTOP',  `vmvn	$1, $1')
  define(`LOGOP',   `veor	$1, $2, $3')')

C The eight function names this one source file can be built as.
C NOTE(review): presumably this list is scanned by the GMP configure
C machinery; confirm against the build scripts.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
C The whole routine is written with NEON instructions.
	.fpu	neon
C func(rp, up, vp, n)
C
C Apply LOGOP, followed by POSTOP when it is non-empty, limb by limb to the n
C limbs at up and vp and store the n result limbs at rp.  Limbs are treated
C as 32-bit elements (vld1.32/vst1.32; one d lane = 1 limb, one d register =
C 2 limbs, one q register = 4 limbs, a q register pair = 8 limbs).
C
C Structure:
C   n <= 15  go straight to the tail code at L(bc).
C   n >= 16  process 0-7 limbs so that rp becomes 256-bit aligned, run a
C            software-pipelined loop that handles 16 limbs per iteration
C            using :256 hinted stores, drain the pipeline, then run the tail
C            code.
C Only stores carry alignment hints; the loads from up and vp never do.
C
C Registers written: r0-r3, q0-q3, q8-q11 and the condition flags.  q4-q7
C are not touched (NOTE(review): presumably because d8-d15 are callee-saved
C under the AAPCS; confirm).
PROLOGUE(func)
	cmp		n, #15
	ble		L(bc)

C Process leading limbs until rp is 256-bit aligned.  Only bits 2, 3 and 4 of
C rp are tested, so this assumes rp is at least 4-byte aligned on entry.
C Bit 2 set: handle a single limb through lane 0 of d0/d1.  The other lane is
C operated on as well but only lane 0 is stored.
	tst		rp, #4
	beq		L(al1)
	vld1.32		{d0[0]}, [up]!
	vld1.32		{d1[0]}, [vp]!
	sub		n, n, #1
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0[0]}, [rp]!
C Bit 3 set: handle two limbs.  Bit 2 of rp is clear here, so the store may
C use the :64 hint.
L(al1):	tst		rp, #8
	beq		L(al2)
	vld1.32		{d0}, [up]!
	vld1.32		{d1}, [vp]!
	sub		n, n, #2
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0}, [rp:64]!
C Bit 4 set: handle four limbs.  Bits 2 and 3 of rp are clear here, so the
C store may use the :128 hint.  The flags written by this subs are not read
C by anything; the subs at L(al3) overwrites them before the next branch.
L(al2):	tst		rp, #16
	beq		L(al3)
	vld1.32		{q0}, [up]!
	vld1.32		{q1}, [vp]!
	subs		n, n, #4
	LOGOP(		q0, q0, q1)
	POSTOP(		q0, q0)
	vst1.32		{q0}, [rp:128]!
C Prime the pipeline with 8 limbs of each operand in q8-q9/q10-q11.  At least
C 9 limbs remain here (n was >= 16 and at most 7 were consumed above), so
C this load is in bounds.  n is then biased by -24: 8 for the limbs just
C loaded plus 16 for the further limbs one loop iteration loads.  Hence
C n >= 0 means a full iteration can run; otherwise only the drain at L(end)
C is done.
L(al3):	vld1.32		{q8-q9}, [up]!
	vld1.32		{q10-q11}, [vp]!
	subs		n, n, #8+16
	blt		L(end)

C Main loop, 16 limbs per iteration.  On entry q8-q11 hold 8 limbs of each
C operand that are loaded but not yet processed.  Each iteration loads 8
C limbs into q0-q3, finishes and stores q8-q9, reloads q8-q11 with the next 8
C limbs, then finishes and stores q0-q1.  The loads are placed ahead of the
C operations that use them, presumably to hide load latency.  The bge at the
C bottom consumes the flags from the subs; the NEON instructions in between
C do not alter them.
	ALIGN(16)
L(top):	vld1.32		{q0-q1}, [up]!
	LOGOP(		q8, q8, q10)
	vld1.32		{q2-q3}, [vp]!
	LOGOP(		q9, q9, q11)
	POSTOP(		q8, q8)
	POSTOP(		q9, q9)
	vst1.32		{q8-q9}, [rp:256]!
	subs		n, n, #16
	vld1.32		{q8-q9}, [up]!
	LOGOP(		q0, q0, q2)
	vld1.32		{q10-q11}, [vp]!
	LOGOP(		q1, q1, q3)
	POSTOP(		q0, q0)
	POSTOP(		q1, q1)
	vst1.32		{q0-q1}, [rp:256]!
	bge		L(top)

C Drain the pipeline: q8-q11 still hold 8 loaded but unprocessed limbs of
C each operand.
L(end):	LOGOP(		q8, q8, q10)
	LOGOP(		q9, q9, q11)
	POSTOP(		q8, q8)
	POSTOP(		q9, q9)
	vst1.32		{q8-q9}, [rp:256]!

C Process the last 0-15 limbs, in chunks of 8, 4, 2 and 1 selected by the low
C four bits of n.  When arriving from the loop n is in the range -16..-1,
C i.e. the number of limbs left minus 16, so its low four bits still equal
C the number of limbs left.  Note that rp is aligned after the loop, but not
C when we arrive here via L(bc), so these stores carry no alignment hint.
L(bc):	tst		n, #8
	beq		L(tl0)
	vld1.32		{q0-q1}, [up]!
	vld1.32		{q2-q3}, [vp]!
	LOGOP(		q0, q0, q2)
	LOGOP(		q1, q1, q3)
	POSTOP(		q0, q0)
	POSTOP(		q1, q1)
	vst1.32		{q0-q1}, [rp]!
L(tl0):	tst		n, #4
	beq		L(tl1)
	vld1.32		{q0}, [up]!
	vld1.32		{q1}, [vp]!
	LOGOP(		q0, q0, q1)
	POSTOP(		q0, q0)
	vst1.32		{q0}, [rp]!
L(tl1):	tst		n, #2
	beq		L(tl2)
	vld1.32		{d0}, [up]!
	vld1.32		{d1}, [vp]!
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0}, [rp]!
L(tl2):	tst		n, #1
	beq		L(tl3)
	vld1.32		{d0[0]}, [up]!
	vld1.32		{d1[0]}, [vp]!
	LOGOP(		d0, d0, d1)
	POSTOP(		d0, d0)
	vst1.32		{d0[0]}, [rp]!
C Return to the caller; no value is set up in r0.
L(tl3):	bx		lr
EPILOGUE()


More information about the gmp-devel mailing list