From liushiyu25 at mails.tsinghua.edu.cn  Sat May  9 05:19:13 2026
From: liushiyu25 at mails.tsinghua.edu.cn (liushiyu25 at mails.tsinghua.edu.cn)
Date: Sat, 9 May 2026 11:19:13 +0800 (GMT+08:00)
Subject: AArch64 logops_n NEON optimization
Message-ID: <1d6dfbb8.55453.19e0abf0e55.Coremail.liushiyu25@mails.tsinghua.edu.cn>

Hi,

I have been experimenting with a NEON implementation of the AArch64 mpn_logops_n routines. It uses an 8-limb NEON bulk loop for n > 7, with a shared 4/2/1 tail path for small inputs and residual limbs.

I tested this on my Apple M4 laptop and on a Neoverse N1 system using tune/speed. On Apple M4, I did not observe regressions in the tested range of small limb counts. For representative large sizes, the old/new ratio was above 1.40x in the same tune/speed setup.

I have also tried similar NEON-based rewrites for a few other AArch64 mpn routines, such as lshift, rshift, com, copyi, and copyd, and saw useful speedups in my local measurements. I noticed that GMP has historically had ARM/NEON variants for some related routines, so I would be interested to know whether this direction has been considered before for the current AArch64 code.

I am sending logops_n first because it is the one I am most confident about, and I would like to check whether there are any obvious issues with the approach before looking at other routines.

The m4 file is pasted below, and the implementation passes the GMP test suite in my local setup.

dnl  AArch64 NEON mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n.

include(`../config.m4')

C Replace the m4 comment delimiter.  With the default delimiter everything
C after a # immediate would be treated as an m4 comment, and a label macro
C following it on the same line (as on the tbz lines) would not be expanded.
changecom(blah)

C Argument registers.  The code reads n limbs from each of up and vp and
C writes n limbs to rp.
define(`rp', `x0')
define(`up', `x1')
define(`vp', `x2')
define(`n',  `x3')

C Per-operation macros, selected by whichever OPERATION_xxx symbol is defined:
C   func     name of the entry point
C   LOGOP    scalar form, three x register operands: dst, src1, src2
C   VLOGOP   vector form, three v register operands handled as 16 bytes
C   POSTOP   scalar complement of the LOGOP result, done in place
C   VPOSTOP  vector complement of the VLOGOP result, done in place
C
C POSTOP and VPOSTOP default to dnl, so for operations that need no final
C complement the whole invocation line disappears from the output.
define(`POSTOP',  `dnl')
define(`VPOSTOP', `dnl')

C rp = up AND vp
ifdef(`OPERATION_and_n',`
  define(`func',    `mpn_and_n')
  define(`LOGOP',   `and     $1, $2, $3')
  define(`VLOGOP',  `and     $1.16b, $2.16b, $3.16b')')

C rp = up AND NOT vp
ifdef(`OPERATION_andn_n',`
  define(`func',    `mpn_andn_n')
  define(`LOGOP',   `bic     $1, $2, $3')
  define(`VLOGOP',  `bic     $1.16b, $2.16b, $3.16b')')

C rp = NOT (up AND vp)
ifdef(`OPERATION_nand_n',`
  define(`func',    `mpn_nand_n')
  define(`LOGOP',   `and     $1, $2, $3')
  define(`POSTOP',  `mvn     $1, $1')
  define(`VLOGOP',  `and     $1.16b, $2.16b, $3.16b')
  define(`VPOSTOP', `not     $1.16b, $1.16b')')

C rp = up OR vp
ifdef(`OPERATION_ior_n',`
  define(`func',    `mpn_ior_n')
  define(`LOGOP',   `orr     $1, $2, $3')
  define(`VLOGOP',  `orr     $1.16b, $2.16b, $3.16b')')

C rp = up OR NOT vp
ifdef(`OPERATION_iorn_n',`
  define(`func',    `mpn_iorn_n')
  define(`LOGOP',   `orn     $1, $2, $3')
  define(`VLOGOP',  `orn     $1.16b, $2.16b, $3.16b')')

C rp = NOT (up OR vp)
ifdef(`OPERATION_nior_n',`
  define(`func',    `mpn_nior_n')
  define(`LOGOP',   `orr     $1, $2, $3')
  define(`POSTOP',  `mvn     $1, $1')
  define(`VLOGOP',  `orr     $1.16b, $2.16b, $3.16b')
  define(`VPOSTOP', `not     $1.16b, $1.16b')')

C rp = up XOR vp
ifdef(`OPERATION_xor_n',`
  define(`func',    `mpn_xor_n')
  define(`LOGOP',   `eor     $1, $2, $3')
  define(`VLOGOP',  `eor     $1.16b, $2.16b, $3.16b')')

C rp = NOT (up XOR vp).  The scalar side does this in a single eon, the
C vector side uses eor followed by not.
ifdef(`OPERATION_xnor_n',`
  define(`func',    `mpn_xnor_n')
  define(`LOGOP',   `eon     $1, $2, $3')
  define(`VLOGOP',  `eor     $1.16b, $2.16b, $3.16b')
  define(`VPOSTOP', `not     $1.16b, $1.16b')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()

C func (rp, up, vp, n)
C
C Applies the selected logical operation limb by limb: reads n 64-bit limbs
C from up and from vp and stores n result limbs at rp.
C
C In:      x0 = rp, x1 = up, x2 = vp, x3 = n (limb count)
C Scratch: x8-x11, v16-v23, condition flags
C
C Within every step all loads are issued before the stores of that step.
C No stack is used and no other function is called.
PROLOGUE(func)
        cmp     n, #8
        b.hs    L(bulk)                 C 8 or more limbs: vector loop first

C Residual path, driven by the low three bits of n: 4 limbs, then 2, then 1.
C Reached directly when n is below 8, or from the loop exit with n already
C reduced to n mod 8.
L(rest):
        tbz     n, #2, L(rest2)
        ldp     q16, q17, [up], #32
        ldp     q18, q19, [vp], #32
        VLOGOP( v16, v16, v18)
        VLOGOP( v17, v17, v19)
        VPOSTOP(v16)
        VPOSTOP(v17)
        stp     q16, q17, [rp], #32

L(rest2):
        tbz     n, #1, L(rest1)
        ldp     x8, x9, [up], #16
        ldp     x10, x11, [vp], #16
        LOGOP(  x8, x8, x10)
        LOGOP(  x9, x9, x11)
        POSTOP( x8)
        POSTOP( x9)
        stp     x8, x9, [rp], #16

L(rest1):
        tbz     n, #0, L(done)
        ldr     x8, [up]
        ldr     x10, [vp]
        LOGOP(  x8, x8, x10)
        POSTOP( x8)
        str     x8, [rp]

L(done):
        ret

C Bulk path: x8 counts 8-limb blocks, n keeps only the residual limb count.
L(bulk):
        lsr     x8, n, #3
        and     n, n, #7

L(top):
        ldp     q16, q17, [up], #32
        ldp     q20, q21, [vp], #32
        ldp     q18, q19, [up], #32
        ldp     q22, q23, [vp], #32
        subs    x8, x8, #1              C flags stay live until the b.ne below
        VLOGOP( v16, v16, v20)
        VLOGOP( v17, v17, v21)
        VLOGOP( v18, v18, v22)
        VLOGOP( v19, v19, v23)
        VPOSTOP(v16)
        VPOSTOP(v17)
        VPOSTOP(v18)
        VPOSTOP(v19)
        stp     q16, q17, [rp], #32
        stp     q18, q19, [rp], #32
        b.ne    L(top)

        cbnz    n, L(rest)              C 1 to 7 limbs left over
        ret
EPILOGUE()