Neon mul_basecase
Richard Henderson
rth at twiddle.net
Mon Feb 25 07:04:40 CET 2013
I'm not 100% sure how to interpret the speed results, since they're not
really in the same units as before. But compared with mpn_mul_2 they're
not encouraging. Am I doing something wrong with the testing, or did the
1.64 cyc/limb I got for addmul_8 not really carry over to this test?
But at least it passes make check...
> $ ./speed -C -s 1-400 -f 1.5 mpn_mul_2
> clock_gettime is 1.000ns accurate
> overhead 399.46 cycles, precision 10000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
> mpn_mul_2
> 1 n/a
> 2 637.7681
> 3 484.2773
> 4 407.5081
> 6 418.4830
> 9 329.4083
> 13 338.0055
> 19 307.1374
> 28 221.8464
> 42 223.5204
> 63 283.4794
> 94 199.7416
> 141 150.1862
> 211 114.0748
> 316 91.5833
> [rth at fremont tune]$ ./speed -C -s 1-400 -f 1.5 mpn_mul_basecase
> clock_gettime is 1.000ns accurate
> overhead 405.05 cycles, precision 10000 units of 1.00e-09 secs, CPU freq 1694.10 MHz
> mpn_mul_basecase
> 1 1890.2391
> 2 1072.0477
> 3 1517.6313
> 4 1505.7726
> 6 1494.0550
> 9 1062.7654
> 13 1427.9960
> 19 1177.6670
> 28 1010.8937
> 42 1018.4768
> 63 1101.3801
> 94 1353.1714
> 141 1836.2722
> 211 2595.3451
> 316 3823.3425
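(A rough way to put these on a common footing, assuming speed -C simply
divides total cycles by the size n and that mpn_mul_basecase is being
timed as an n x n multiply: an n x n basecase multiply does n^2 limb
products, so dividing the printed figure by n once more gives cycles per
limb product -- e.g. at n=316 that's 3823.34 / 316 ~= 12.1.  Reading
mpn_mul_2 the same way, with 2 products per limb, gives 91.58 / 2 ~= 45.8
at n=316, which is part of why I'm not sure the two columns are directly
comparable.)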
P.S. This version incorporates none of what we talked about this
afternoon. I just wanted to have a working starting point with which to
compare.
r~
-------------- next part --------------
dnl ARM Neon mpn_mul_basecase.
dnl
dnl Copyright 2013 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
.fpu neon
.arch armv6t2
.arm
C cycles/limb
C Matching the push below.
define(`POPFRAME', `pop { r4, r5, r6, r7, r8, lr }')
define(`POPRETURN', `pop { r4, r5, r6, r7, r8, pc }')
ASM_START()
PROLOGUE(mpn_mul_basecase)
.cfi_startproc
push { r4, r5, r6, r7, r8, lr }
.cfi_adjust_cfa_offset 24
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
.cfi_rel_offset r6, 8
.cfi_rel_offset r7, 12
.cfi_rel_offset r8, 16
.cfi_rel_offset lr, 20
C Save away the parameters, but we'll always try to keep the
C normal parameter registers up to date for the next call.
ldr r8, [sp, #24] @ vn
mov r4, r0 @ rp
mov r5, r1 @ up
mov r6, r2 @ un
mov r7, r3 @ vp
C Defer to other routines for the initial multiply sans accumulate.
C Arrange for VN to be even after this.
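C Overall structure: if VN is odd, do an mpn_mul_1 first, otherwise an
C mpn_mul_2, so that VN is even afterwards.  Then peel off 14 V limbs at
C a time with mulbc_addmul_14, and finally dispatch on the remaining
C 0-12 (even) limbs via a table jump that combines mpn_addmul_2,
C mulbc_addmul_6 and mulbc_addmul_8.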
tst r8, #1
beq 1f
ldr r3, [r3]
bl mpn_mul_1
str r0, [r4, r6, lsl #2]
add r4, r4, #4
add r7, r7, #4
sub r8, r8, #1
b 2f
1:
bl mpn_mul_2
add ip, r4, r6, lsl #2
add r4, r4, #8
add r7, r7, #8
sub r8, r8, #2
str r0, [ip, #4]
2:
mov r0, r4 @ setup args for next addmul call
mov r1, r5
mov r2, r6
mov r3, r7
C Use a special addmul_14 until we've only a few left.
subs r8, r8, #14 @ vn >= 14?
blt 4f
3:
bl mulbc_addmul_14
subs r8, r8, #14
add r4, r4, #14*4
add r7, r7, #14*4
mov r0, r4 @ setup args for next addmul call
mov r1, r5
mov r2, r6
mov r3, r7
bge 3b
C Table jump to select the logic for the last 0-12 (even) limbs.
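C On arrival here r8 is the remaining vn minus 14, an even value in
C [-14, -2]; adding 14 back gives the remainder 0-12, so r8 lsl #1 is a
C word offset (0, 4, ..., 24) into the seven-entry table.  The pc value
C read by the ldr is the address of the first .word, and the -8 in each
C entry cancels the ARM-state pc read-ahead of the add at 5:.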
4: add r8, r8, #14
ldr ip, [pc, r8, lsl #1]
5: add pc, pc, ip
.word 0f - 5b - 8
.word 2f - 5b - 8
.word 4f - 5b - 8
.word 6f - 5b - 8
.word 8f - 5b - 8
.word 10f - 5b - 8
.word 12f - 5b - 8
4: bl L(do_addmul_2)
2: bl L(do_addmul_2)
0: POPRETURN
10: bl L(do_addmul_2)
8: POPFRAME
b mulbc_addmul_8
12: bl mulbc_addmul_6
add r0, r4, #6*4
mov r1, r5
mov r2, r6
add r3, r7, #6*4
6: POPFRAME
b mulbc_addmul_6
L(do_addmul_2):
mov r8, lr
bl mpn_addmul_2
add ip, r4, r6, lsl #2
add r4, r4, #8
add r7, r7, #8
str r0, [ip, #4]
mov r0, r4
mov r1, r5
mov r2, r6
mov r3, r7
bx r8
.cfi_endproc
EPILOGUE()
C
C These addmul_N are like the standard set, except that they do NOT
C return the last limb. They store it in RP like all the rest.
C
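C In C terms, an illustrative sketch of the contract (mpn_addmul_N for
C N=6,8,14 is hypothetical here; compare how L(do_addmul_2) above stores
C mpn_addmul_2's return value instead of passing it on):
C
C   void mulbc_addmul_N (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                        mp_srcptr vp)
C   {
C     /* {rp, un+N} = {rp, un} + {up, un} * {vp, N}; no return value. */
C     rp[un + N - 1] = mpn_addmul_N (rp, up, un, vp);
C   }
C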
define(`Dv01', `d0') C v2si of two multiplier limbs
define(`Dv23', `d1') C ... 2nd
define(`Dv45', `d2') C ... 3rd
define(`Dv67', `d3') C ... 4th
define(`Dv89', `d4') C ... 5th
define(`DvAB', `d5') C ... 6th
define(`DvCD', `d6') C ... 7th
define(`Du0', `d7') C v2si of one replicated multiplicand limb
define(`Qc01', `q8') C v2di of product or carry
define(`Qc23', `q9') C ... 2nd
define(`Qc45', `q10') C ... 3rd
define(`Qc67', `q11') C ... 4th
define(`Qc89', `q12') C ... 5th
define(`QcAB', `q13') C ... 6th
define(`QcCD', `q14') C ... 7th
define(`Dc0', `d16') C v1di (scalar) aliases of Qc01
define(`Dc1', `d17')
define(`Dc2', `d18')
define(`Dc3', `d19')
define(`Dc4', `d20')
define(`Dc5', `d21')
define(`Dc6', `d22')
define(`Dc7', `d23')
define(`Dc8', `d24')
define(`Dc9', `d25')
define(`DcA', `d26')
define(`DcB', `d27')
define(`DcC', `d28')
define(`DcD', `d29')
define(`QaN', `q15') C v4si with [0] containing next addend limb
define(`DaN', `d30')
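C Carry/accumulator scheme used by all three routines below: each Qc
C register holds two 64-bit column accumulators, so Qc01..QcCD cover the
C output columns currently in flight.  Each vmlal.u32 adds two 32x32->64
C products into one Qc.  After the products for one up limb, the low 32
C bits of Dc0 are the next result limb and are stored; the vext.32 chain
C then rotates everything down by one 32-bit lane, pulling the next
C addend limb (or zero) in from QaN at the top, and the vpaddl.u32 chain
C folds each accumulator's high half into the accumulator one column up,
C propagating the carries.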
.cfi_startproc
TYPE(mulbc_addmul_6, function)
ALIGN(16)
mulbc_addmul_6:
vld1.32 {Dc0, Dc1, Dc2}, [r0] @ load 6 rp limbs
vld1.32 {Dv01, Dv23, Dv45}, [r3] @ load all vp limbs
add r3, r0, #6*4
vmov.i32 QaN, #0 @ clear addend input
vmovl.u32 Qc45, Dc2 @ extend rp limbs to carry-in
vmovl.u32 Qc23, Dc1
vmovl.u32 Qc01, Dc0
subs r2, r2, #7 @ 6, 7, or 8 limbs?
ble L(small_6)
vld1.32 {Du0}, [r1]! @ load 2 up limbs
vld1.32 {DaN}, [r3]! @ load a6 and a7
b L(entry_6)
C Rotated main loop body. Processing 2 UP limbs.
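C DaN holds the next two rp addend limbs; the first is rotated in after
C the u0 products, then the vshr.u64 below moves the second down so it
C is rotated in after the u1 products.  The loads for the next iteration
C (two up limbs, two rp limbs) are issued ahead of their first use.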
ALIGN(16)
L(loop_6):
vld1.32 {Du0}, [r1]! @ load next two up limbs
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, QaN, #1
vld1.32 {DaN}, [r3]! @ load next two rp limbs
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
C Rotated main loop entry.
L(entry_6):
vmlal.u32 Qc01, Dv01, Du0[0] @ compute u0 products
vmlal.u32 Qc23, Dv23, Du0[0]
vmlal.u32 Qc45, Dv45, Du0[0]
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, QaN, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vmlal.u32 Qc01, Dv01, Du0[1] @ compute u1 products
vshr.u64 DaN, DaN, #32
vmlal.u32 Qc23, Dv23, Du0[1]
subs r2, r2, #2 @ at least 8 limbs left?
vmlal.u32 Qc45, Dv45, Du0[1]
bgt L(loop_6)
C Rotated main loop tail.
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, QaN, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vmov.i32 DaN, #0 @ no more rp limbs
L(small_6):
C We have 6 or 7 limbs left to process. Handle them one at a time.
vld1.32 {Du0[0]}, [r1]!
vmov.i32 DaN, #0 @ zero accumulate
add r2, r2, #7 @ undo bias of n
bne L(single_entry) @ flags still set for N==7
vld1.32 {DaN[0]}, [r3]! @ load last addend limb
b L(single_entry)
C Rotated single multiply loop body, excluding accumulate.
ALIGN(16)
L(single_loop):
vld1.32 {Du0[0]}, [r1]!
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, QaN, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vmov.i32 DaN, #0 @ zero accumulate
L(single_entry):
vmlal.u32 Qc01, Dv01, Du0[0] @ compute u0 products
subs r2, r2, #1
vmlal.u32 Qc23, Dv23, Du0[0]
vmlal.u32 Qc45, Dv45, Du0[0]
bne L(single_loop)
C Rotated single multiply loop tail. Normally we'd have a copy of
C the loop body here, followed by a branch to finish_6 to tidy up all
C of the carries; but finish_7 is exactly that, so long as we ensure
C that Dc7 is zero.
vmov.i32 Qc67, #0
b L(finish_7)
SIZE(mulbc_addmul_6, `.-mulbc_addmul_6')
TYPE(mulbc_addmul_8, function)
ALIGN(16)
mulbc_addmul_8:
vld1.32 {Dc0, Dc1, Dc2, Dc3}, [r0] @ load 8 rp limbs
vld1.32 {Dv01, Dv23, Dv45, Dv67}, [r3] @ load all vp limbs
cmp r2, #8 @ more than 8 limbs?
vld1.32 {Du0[0]}, [r1]! @ load first up
ldrgt r3, [r0, #8*4] @ maybe load 9th rp
ldr ip, [r1], #4 @ load 2nd up limb
moveq r3, #0 @ maybe zero 9th rp
vmov.i32 QaN, #0 @ clear addend input
vmovl.u32 Qc67, Dc3 @ extend rp limbs to carry-in
vmovl.u32 Qc45, Dc2
vmovl.u32 Qc23, Dc1
vmovl.u32 Qc01, Dc0
ALIGN(16)
@ Main loop -- 8 x 1 multiplications
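@ One up limb per iteration: 8 products via four vmlal, one result limb
@ stored, carries rotated and folded as described above.  The integer
@ side runs one step ahead: ip holds the next up limb and r3 the next
@ rp addend limb (or 0 once we run off the end), fed into Du0/DaN here.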
L(loop_8):
vmlal.u32 Qc01, Dv01, Du0[0] @ compute all products
vmov.32 DaN[0], r3 @ move 9th rp into place
vmlal.u32 Qc23, Dv23, Du0[0]
cmp r2, #9 @ more than 8 limbs left?
vmlal.u32 Qc45, Dv45, Du0[0]
ldrgt r3, [r0, #36] @ maybe load next rp limb
vmlal.u32 Qc67, Dv67, Du0[0]
movle r3, #0 @ maybe zero next rp limb
vmov.32 Du0[0], ip @ move next up limb into place
subs r2, r2, #1
vst1.u32 {Dc0[0]}, [r0]! @ store lowest product limb
ldrne ip, [r1], #4 @ maybe load next up limb
vext.32 Qc01, Qc01, Qc23, #1 @ rotate carry down one
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, QaN, #1 @ rotate in 9th addend or 0.
vpaddl.u32 Qc01, Qc01 @ fold carries
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
bne L(loop_8)
@ Done with all products. Finish propagating carries.
b L(finish_8)
SIZE(mulbc_addmul_8, `.-mulbc_addmul_8')
TYPE(mulbc_addmul_14, function)
ALIGN(16)
mulbc_addmul_14:
vld1.32 {Dv01, Dv23, Dv45, Dv67}, [r3]! @ load 8 vp limbs
vld1.32 {Dv89, DvAB, DvCD}, [r3] @ load 6 more vp
add r3, r0, #32
vld1.32 {Dc0, Dc1, Dc2, Dc3}, [r0] @ load 8 rp limbs
vld1.32 {Dc4, Dc5, Dc6}, [r3] @ load 6 more rp
cmp r2, #14 @ more than 14 limbs?
vld1.32 {Du0[0]}, [r1]! @ load first up
ldrgt r3, [r0, #14*4] @ maybe load 15th rp
ldr ip, [r1], #4 @ load 2nd up limb
moveq r3, #0 @ maybe zero 15th rp
vmov.i32 QaN, #0 @ clear addend input
vmovl.u32 QcCD, Dc6 @ extend rp limbs to carry-in
vmovl.u32 QcAB, Dc5
vmovl.u32 Qc89, Dc4
vmovl.u32 Qc67, Dc3
vmovl.u32 Qc45, Dc2
vmovl.u32 Qc23, Dc1
vmovl.u32 Qc01, Dc0
ALIGN(16)
@ Main loop -- 14 x 1 multiplications
L(loop_14):
vmlal.u32 Qc01, Dv01, Du0[0] @ compute all products
vmov.32 DaN[0], r3 @ move 15th rp into place
vmlal.u32 Qc23, Dv23, Du0[0]
cmp r2, #14+1 @ more than 14 limbs left?
vmlal.u32 Qc45, Dv45, Du0[0]
ldrgt r3, [r0, #(14+1)*4] @ maybe load next rp limb
vmlal.u32 Qc67, Dv67, Du0[0]
movle r3, #0 @ maybe zero next rp limb
vmlal.u32 Qc89, Dv89, Du0[0]
subs r2, r2, #1
vmlal.u32 QcAB, DvAB, Du0[0]
vmlal.u32 QcCD, DvCD, Du0[0]
vmov.32 Du0[0], ip @ move next up limb into place
vst1.u32 {Dc0[0]}, [r0]! @ store lowest product limb
ldrne ip, [r1], #4 @ maybe load next up limb
vext.32 Qc01, Qc01, Qc23, #1 @ rotate carry down one
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Qc89, Qc89, QcAB, #1
vext.32 QcAB, QcAB, QcCD, #1
vext.32 QcCD, QcCD, QaN, #1 @ rotate in next addend or 0.
vpaddl.u32 Qc01, Qc01 @ fold carries
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Qc89, Qc89
vpaddl.u32 QcAB, QcAB
vpaddl.u32 QcCD, QcCD
bne L(loop_14)
@ Done with all products. Finish propagating carries.
L(finish_14):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Qc89, Qc89, QcAB, #1
vext.32 QcAB, QcAB, QcCD, #1
vext.32 DcC, DcC, DcD, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Qc89, Qc89
vpaddl.u32 QcAB, QcAB
vpaddl.u32 DcC, DcC
L(finish_13):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Qc89, Qc89, QcAB, #1
vext.32 QcAB, QcAB, QcCD, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Qc89, Qc89
vpaddl.u32 QcAB, QcAB
L(finish_12):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Qc89, Qc89, QcAB, #1
vext.32 DcA, DcA, DcB, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Qc89, Qc89
vpaddl.u32 DcA, DcA
L(finish_11):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Qc89, Qc89, QcAB, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Qc89, Qc89
L(finish_10):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vext.32 Dc8, Dc8, Dc9, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
vpaddl.u32 Dc8, Dc8
L(finish_9):
vst1.u32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Qc67, Qc67, Qc89, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Qc67, Qc67
L(finish_8):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vext.32 Dc6, Dc6, Dc7, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
vpaddl.u32 Dc6, Dc6
L(finish_7):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Qc45, Qc45, Qc67, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Qc45, Qc45
L(finish_6):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vext.32 Dc4, Dc4, Dc5, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
vpaddl.u32 Dc4, Dc4
L(finish_5):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Qc23, Qc23, Qc45, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Qc23, Qc23
L(finish_4):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vext.32 Dc2, Dc2, Dc3, #1
vpaddl.u32 Qc01, Qc01
vpaddl.u32 Dc2, Dc2
L(finish_3):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Qc01, Qc01, Qc23, #1
vpaddl.u32 Qc01, Qc01
L(finish_2):
vst1.32 {Dc0[0]}, [r0]!
vext.32 Dc0, Dc0, Dc1, #1
vpaddl.u32 Dc0, Dc0
L(finish_1):
vst1.32 {Dc0[0]}, [r0]!
bx lr
SIZE(mulbc_addmul_14, `.-mulbc_addmul_14')
.cfi_endproc