[PATCH] Optimize RISC-V mpn_add_n/mpn_sub_n
camel-cdr
camel-cdr at protonmail.com
Tue Jun 2 22:06:48 CEST 2026
Hi,
I have a faster mpn_add_n/mpn_sub_n implementation, that
uses the branch predictor to break the carry dependency
chain by predicting that the carry-in doesn't produce a
carry.
This means that the implementation isn't constant-time, but
afaik libgmp doesn't guarantee constant time, so I think it
should be fine to add. Please correct me if I'm wrong.
For constant time, you can still get to 2 cycles/limb for
add by keeping the two carries separate and beyond 1
cycle/limb using RVV.
Here are benchmark results on various RISC-V implementations:
gmp-6.3 vs patch cycles/limb:
addition subtraction
gmp patch| gmp patch
XuanTie-C908: 4.90 4.64 | 4.92 4.85 (dual-issue in-order)
SpacemiT-X60: 4.78 4.20 | 4.78 4.28 (dual-issue in-order)
SpacemiT-A100: 6.04 4.04 | 5.04 4.30 (dual-issue in-order)
XuanTie-C910: 3.59 2.79 | 3.36 2.79 (3-wide out-of-order)
SpacemiT-X100: 3.31 2.32 | 2.39 2.31 (4-wide out-of-order)
RTL simulation of open-source processors:
RocketChip: 9.33 8.32 | 9.34 8.32 (1-issue in-order)
Shuttle: 5.57 4.82 | 5.62 4.59 (2-issue in-order)
Shuttle3: 4.34 3.82 | 4.33 4.07 (3-issue in-order)
MediumBoomV3: 4.78 4.20 | 4.78 4.19 (2-wide out-of-order)
LargeBoomV3: 4.03 3.04 | 4.03 3.04 (3-wide out-of-order)
MegaBoomV3: 3.03 2.11 | 2.37 2.11 (4-wide out-of-order)
XiangShan
KunminghuV3: 3.01 1.08 | 2.03 1.08 (8-wide out-of-order)
The code is at the end of the message. I haven't
contributed to libgmp before, so idk what the regular
procedure is, I hope this is alright.
I optimized the code to run well on dual-issue in-order CPUs
with support for load/store pair fusion (X60/A100) or
ALU+load/store dual-issue (C908/Shuttle).
To optimize for both simultaneously, I interleave
independent load+load and ALU+ALU pairs, such that a
processor can either execute
...,MEM+MEM,ALU+ALU,... or
...,MEM+ALU,ALU+MEM,...
The "dual" macro is used as a visual.
I also have faster mpn_add_n and mpn_mul_n implementations
using the RISC-V vector extension (RVV). But those are
currently in C intrinsics and I still have to write the
assembly and add overlapping scalar execution to get the
best performance. My plan is to write RVV implementations
for the basic mpn primitives at some point.
- Olaf
===== aside =====
The same carry-breaking trick can also be used in ISAs with
carry flag, instead of doing it for every carry it's
probably better to do it every four carries or larger.
I didn't get any speed up on Zen1 or Zen4 with the code I tried.
On Arm I got a slowdown on Cortex-A55 and A78, the same
performance on Cortex-X1, but on the Apple M3 I got a
speedup from 1 cycle/limb 0.72 cycles/limb. If anybody wants
to reproduce this, I replaced the first of the 4x unrolled
adcs with:
adcs rd, arg1, arg2
adds tmp, arg1, arg2
cmp tmp, #-1
b.eq fixup
I didn't actually implement fixup, because that wasn't
necessary for my test, but it should be quite straight
forward.
One thing to note is that on the very wide x86 and Arm
processors, carry-setting ALU instructions usually only have
half the throughput of regular ALU instructions. So I also
tried to emulate the execution the RISC-V code on the M3, by
using add for add, xor for sltu (because there was no
equivalent) and cbz instead of bltu. This obviously produces
the wrong result, but simulates the same resource usage
you'd expect from a similarly wide RISC-V core.
The result was 0.85 cycle/limb.
===== mpn/riscv/64/aors_n.asm =====
include(`../config.m4')
ifdef(`OPERATION_add_n',` define(`func', `mpn_add_n') ')
ifdef(`OPERATION_sub_n',` define(`func', `mpn_sub_n') ')
MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
C input parameters
define(`dp', `a0') C dest pointer
define(`lp', `a1') C left pointer
define(`rp', `a2') C right pointer
define(`nn', `t1') C limb count
C carries
define(`c1', `t0') C primary carry, even iterations
define(`c2', `t2') C primary carry, odd iterations
define(`c3', `a6') C secondary carry
C left/right limbs
define(`l0', `a3')
define(`r0', `a4')
define(`l1', `t3')
define(`r1', `a5')
define(`l2', `t4')
define(`r2', `t5')
define(`l3', `t6')
define(`r3', `a7')
C visual guide used to mark two independent instructions
define(`dual',
`$1
$2')
ASM_START()
PROLOGUE(func)
andi a4, a3, 1
srli nn, a3, 2
andi a3, a3, 2
bnez a4, L(bx1)
L(bx0):
bnez a3, L(b10)
L(b00):
dual(` ld l0, 0(lp)',` ld l1, 8(lp)')
dual(` ld r0, 0(rp)',` ld r1, 8(rp)')
ifdef(`OPERATION_add_n',`
dual(` addi nn, nn, -1',` add l0, l0, r0')
dual(` sltu c2, l0, r0',` add l1, l1, r1')
dual(` sltu c1, l1, r1',` add r1, l1, c2')
dual(` sd l0, 0(dp)',` sd r1, 8(dp)')
sltu c3, r1, l1
') ifdef(`OPERATION_sub_n',`
dual(` addi nn, nn, -1',` sltu c2, l0, r0')
dual(` sltu c1, l1, r1',` sub l1, l1, r1')
dual(` sub l0, l0, r0',` sub r1, l1, c2')
dual(` sd l0, 0(dp)',` sd r1, 8(dp)')
sltu c3, l1, c2
')
dual(` ld l0, 16(lp)',` ld r0, 16(rp)')
dual(` ld l1, 24(lp)',` ld r1, 24(rp)')
bnez nn, L(loop4)
j L(final2)
L(b11):
dual(` ld l1, 0(lp)',` ld l0, 8(lp)')
dual(` ld r1, 0(rp)',` ld r0, 8(rp)')
dual(` addi lp, lp, -8',` addi rp, rp, -8')
ifdef(`OPERATION_add_n',`
dual(` add l1, l1, r1',` sd l1, 0(dp)')
dual(` sltu c1, l1, r1',` ld l1, 24(lp)')
') ifdef(`OPERATION_sub_n',`
dual(` sltu c1, l1, r1',` sub l1, l1, r1')
dual(` sd l1, 0(dp)',` ld l1, 24(lp)')
')
dual(` addi dp, dp, -8',` ld r1, 24(rp)')
dual(` li c3, 0',` bnez nn, L(loop4)')
j L(final2)
L(b10):
dual(` ld l0, 0(lp)',` ld l1, 8(lp)')
dual(` li c1, 0',` li c3, 0')
dual(` ld r0, 0(rp)',` ld r1, 8(rp)')
dual(` addi dp, dp, -16',` addi lp, lp, -16')
dual(` addi rp, rp, -16',` bnez nn, L(loop4)')
j L(final2)
L(bx1):
bnez a3, L(b11)
L(b01):
ld l0, 0(lp)
ld r0, 0(rp)
ifdef(`OPERATION_add_n',`
add l0, l0, r0
sltu c1, l0, r0
') ifdef(`OPERATION_sub_n',`
sltu c1, l0, r0
sub l0, l0, r0
')
sd l0, 0(dp)
bnez nn, 1f
mv a0, c1
ret
1:
dual(` ld l2, 8(lp)',` ld l3, 16(lp)')
dual(` addi nn, nn, -1',` addi dp, dp, 8')
dual(` ld r0, 8(rp)',` ld r1, 16(rp)')
dual(` addi lp, lp, 8',` addi rp, rp, 8')
ifdef(`OPERATION_add_n',`
dual(` add l2, l2, r0',` add l3, l3, r1')
dual(` sltu c2, l2, r0',` add r0, l2, c1')
dual(` ld l0, 16(lp)',` ld l1, 24(lp)')
dual(` sltu c1, l3, r1',` sltu c3, r0, l2')
add r1, l3, c2
add r1, r1, c3
dual(` sd r0, 0(dp)',` sd r1, 8(dp)')
sltu c3, r1, l3
') ifdef(`OPERATION_sub_n',`
dual(` sltu c2, l2, r0',` sub l2, l2, r0')
dual(` sltu c3, l2, c1',` sub r0, l2, c1')
dual(` sltu c1, l3, r1',` sub l3, l3, r1')
dual(` ld l0, 16(lp)',` ld l1, 24(lp)')
sub r1, l3, c2
sub r1, r1, c3
dual(` sd r0, 0(dp)',` sd r1, 8(dp)')
sltu c3, l3, r1
')
dual(` ld r0, 16(rp)',` ld r1, 24(rp)')
beqz nn, L(final2)
j L(loop4)
.p2align 4
L(loop4):
or c1, c1, c3
L(top):
ifdef(`OPERATION_add_n',`
dual(` ld l2, 32(lp)',` ld l3, 40(lp)')
dual(` add l0, l0, r0',` add l1, l1, r1')
dual(` ld r2, 32(rp)',` ld r3, 40(rp)')
dual(` sltu c2, l0, r0',` add r0, l0, c1')
dual(` addi nn, nn, -1',` bltu r0, l0, L(prop1)')
1: dual(` sltu c1, l1, r1',` add r1, l1, c2')
dual(` sd r0, 16(dp)',` sd r1, 24(dp)')
dual(` addi dp, dp, 32',` bltu r1, l1, L(prop2)')
2: dual(` add l2, l2, r2',` add l3, l3, r3')
dual(` sltu c2, l2, r2',` add r2, l2, c1')
dual(` ld l0, 48(lp)',` ld l1, 56(lp)')
bltu r2, l2, L(prop3)
3: dual(` sltu c1, l3, r3',` add r3, l3, c2')
dual(` ld r0, 48(rp)',` ld r1, 56(rp)')
bltu r3, l3, L(prop4)
4: dual(` addi lp, lp, 32',` addi rp, rp, 32')
dual(` sd r2, 0(dp)',` sd r3, 8(dp)')
') ifdef(`OPERATION_sub_n',`
dual(` ld l2, 32(lp)',` ld l3, 40(lp)')
dual(` sltu c2, l0, r0',` sub l0, l0, r0')
dual(` ld r2, 32(rp)',` ld r3, 40(rp)')
dual(` sub r0, l0, c1',` bltu l0, c1, L(prop1)')
1: dual(` sltu c1, l1, r1',` sub l1, l1, r1')
dual(` addi nn, nn, -1',` addi rp, rp, 32')
dual(` sub r1, l1, c2',` bltu l1, c2, L(prop2)')
2: dual(` sltu c2, l2, r2',` sub l2, l2, r2')
dual(` sd r0, 16(dp)',` sd r1, 24(dp)')
dual(` sub r2, l2, c1',` bltu l2, c1, L(prop3)')
3: dual(` sltu c1, l3, r3',` sub l3, l3, r3')
dual(` ld r0, 16(rp)',` ld r1, 24(rp)')
dual(` sub r3, l3, c2',` bltu l3, c2, L(prop4)')
4: dual(` ld l0, 48(lp)',` ld l1, 56(lp)')
dual(` addi dp, dp, 32',` addi lp, lp, 32')
dual(` sd r2, 0(dp)',` sd r3, 8(dp)')
')
bnez nn, L(top)
li c3, 0
L(final2):
ifdef(`OPERATION_add_n',`
dual(` add l0, l0, r0',` add l1, l1, r1')
dual(` sltu c2, l0, r0',` add r0, l0, c1')
dual(` add r0, r0, c3',` sltu c1, l1, r1')
dual(` sltu c3, r0, l0',` add r1, l1, c2')
add r1, r1, c3
sltu c3, r1, l1
') ifdef(`OPERATION_sub_n',`
dual(` sltu c2, l0, r0',` sub l0, l0, r0')
dual(` sub r0, l0, c1',` sltu c1, l1, r1')
dual(` sub r0, r0, c3',` sub l1, l1, r1')
dual(` sltu c3, l0, r0',` sub r1, l1, c2')
sub r1, r1, c3
sltu c3, l1, r1
')
dual(` sd r0, 16(dp)',` sd r1, 24(dp)')
or a0, c1, c3
ret
EPILOGUE()
L(prop1):
li c2, 1
j 1b
L(prop2):
li c1, 1
j 2b
L(prop3):
li c2, 1
j 3b
L(prop4):
li c1, 1
j 4b
ASM_END()
More information about the gmp-devel
mailing list