[PATCH] Optimize RISC-V mpn_add_n/mpn_sub_n

Tue Jun 2 22:06:48 CEST 2026

Hi,

I have a faster mpn_add_n/mpn_sub_n implementation, that
uses the branch predictor to break the carry dependency
chain by predicting that the carry-in doesn't produce a
carry.
This means that the implementation isn't constant-time, but
afaik libgmp doesn't guarantee constant time, so I think it
should be fine to add. Please correct me if I'm wrong.
For constant time, you can still get to 2 cycles/limb for
add by keeping the two carries separate and beyond 1
cycle/limb using RVV.

Here are benchmark results on various RISC-V implementations:

gmp-6.3 vs patch cycles/limb:
               addition    subtraction
               gmp  patch| gmp  patch
XuanTie-C908:  4.90 4.64 | 4.92 4.85 (dual-issue in-order)
SpacemiT-X60:  4.78 4.20 | 4.78 4.28 (dual-issue in-order)
SpacemiT-A100: 6.04 4.04 | 5.04 4.30 (dual-issue in-order)
XuanTie-C910:  3.59 2.79 | 3.36 2.79 (3-wide out-of-order)
SpacemiT-X100: 3.31 2.32 | 2.39 2.31 (4-wide out-of-order)
RTL simulation of open-source processors:
RocketChip:    9.33 8.32 | 9.34 8.32 (1-issue in-order)
Shuttle:       5.57 4.82 | 5.62 4.59 (2-issue in-order)
Shuttle3:      4.34 3.82 | 4.33 4.07 (3-issue in-order)
MediumBoomV3:  4.78 4.20 | 4.78 4.19 (2-wide out-of-order)
LargeBoomV3:   4.03 3.04 | 4.03 3.04 (3-wide out-of-order)
MegaBoomV3:    3.03 2.11 | 2.37 2.11 (4-wide out-of-order)
XiangShan
KunminghuV3:   3.01 1.08 | 2.03 1.08 (8-wide out-of-order)

The code is at the end of the message. I haven't
contributed to libgmp before, so idk what the regular
procedure is, I hope this is alright.

I optimized the code to run well on dual-issue in-order CPUs
with support for load/store pair fusion (X60/A100) or
ALU+load/store dual-issue (C908/Shuttle).
To optimize for both simultaneously, I interleave
independent load+load and ALU+ALU pairs, such that a
processor can either execute
    ...,MEM+MEM,ALU+ALU,... or
    ...,MEM+ALU,ALU+MEM,...
The "dual" macro is used as a visual.

I also have faster mpn_add_n and mpn_mul_n implementations
using the RISC-V vector extension (RVV). But those are
currently in C intrinsics and I still have to write the
assembly and add overlapping scalar execution to get the
best performance. My plan is to write RVV implementations
for the basic mpn primitives at some point.

- Olaf

===== aside =====

The same carry-breaking trick can also be used in ISAs with
carry flag, instead of doing it for every carry it's
probably better to do it every four carries or larger.

I didn't get any speed up on Zen1 or Zen4 with the code I tried.
On Arm I got a slowdown on Cortex-A55 and A78, the same
performance on Cortex-X1, but on the Apple M3 I got a
speedup from 1 cycle/limb 0.72 cycles/limb. If anybody wants
to reproduce this, I replaced the first of the 4x unrolled
adcs with:

    adcs rd, arg1, arg2
    adds tmp, arg1, arg2
    cmp tmp, #-1
    b.eq fixup

I didn't actually implement fixup, because that wasn't
necessary for my test, but it should be quite straight
forward.

One thing to note is that on the very wide x86 and Arm
processors, carry-setting ALU instructions usually only have
half the throughput of regular ALU instructions. So I also
tried to emulate the execution the RISC-V code on the M3, by
using add for add, xor for sltu (because there was no
equivalent) and cbz instead of bltu. This obviously produces
the wrong result, but simulates the same resource usage
you'd expect from a similarly wide RISC-V core.
The result was 0.85 cycle/limb.

===== mpn/riscv/64/aors_n.asm =====

include(`../config.m4')

ifdef(`OPERATION_add_n',` define(`func',	`mpn_add_n') ')
ifdef(`OPERATION_sub_n',` define(`func',	`mpn_sub_n') ')

MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)

C input parameters
define(`dp',	`a0') C dest pointer
define(`lp',	`a1') C left pointer
define(`rp',	`a2') C right pointer
define(`nn',	`t1') C limb count

C carries
define(`c1',	`t0') C primary carry, even iterations
define(`c2',	`t2') C primary carry, odd iterations
define(`c3',	`a6') C secondary carry

C left/right limbs
define(`l0',	`a3')
define(`r0',	`a4')
define(`l1',	`t3')
define(`r1',	`a5')
define(`l2',	`t4')
define(`r2',	`t5')
define(`l3',	`t6')
define(`r3',	`a7')

C visual guide used to mark two independent instructions
define(`dual',
	`$1
	$2')

ASM_START()
PROLOGUE(func)
	andi	a4, a3, 1
	srli	nn, a3, 2
	andi	a3, a3, 2
	bnez	a4, L(bx1)
L(bx0):
	bnez	a3, L(b10)

L(b00):
	dual(`	ld	l0, 0(lp)',`	ld	l1, 8(lp)')
	dual(`	ld	r0, 0(rp)',`	ld	r1, 8(rp)')
ifdef(`OPERATION_add_n',`
	dual(`	addi	nn, nn, -1',`	add	l0, l0, r0')
	dual(`	sltu	c2, l0, r0',`	add	l1, l1, r1')
	dual(`	sltu	c1, l1, r1',`	add	r1, l1, c2')
	dual(`	sd	l0, 0(dp)',`	sd	r1, 8(dp)')
	sltu	c3, r1, l1
') ifdef(`OPERATION_sub_n',`
	dual(`	addi	nn, nn, -1',`	sltu	c2, l0, r0')
	dual(`	sltu	c1, l1, r1',`	sub	l1, l1, r1')
	dual(`	sub	l0, l0, r0',`	sub	r1, l1, c2')
	dual(`	sd	l0, 0(dp)',`	sd	r1, 8(dp)')
	sltu	c3, l1, c2
')
	dual(`	ld	l0, 16(lp)',`	ld	r0, 16(rp)')
	dual(`	ld	l1, 24(lp)',`	ld	r1, 24(rp)')
	bnez	nn, L(loop4)
	j	L(final2)

L(b11):
	dual(`	ld	l1, 0(lp)',`	ld	l0, 8(lp)')
	dual(`	ld	r1, 0(rp)',`	ld	r0, 8(rp)')
	dual(`	addi	lp, lp, -8',`	addi	rp, rp, -8')
ifdef(`OPERATION_add_n',`
	dual(`	add	l1, l1, r1',`	sd	l1, 0(dp)')
	dual(`	sltu	c1, l1, r1',`	ld	l1, 24(lp)')
') ifdef(`OPERATION_sub_n',`
	dual(`	sltu	c1, l1, r1',`	sub	l1, l1, r1')
	dual(`	sd	l1, 0(dp)',`	ld	l1, 24(lp)')
')
	dual(`	addi	dp, dp, -8',`	ld	r1, 24(rp)')
	dual(`	li	c3, 0',`	bnez	nn, L(loop4)')
	j	L(final2)

L(b10):
	dual(`	ld	l0, 0(lp)',`	ld	l1, 8(lp)')
	dual(`	li	c1, 0',`	li	c3, 0')
	dual(`	ld	r0, 0(rp)',`	ld	r1, 8(rp)')
	dual(`	addi	dp, dp, -16',`	addi	lp, lp, -16')
	dual(`	addi	rp, rp, -16',`	bnez	nn, L(loop4)')
	j	L(final2)

L(bx1):
	bnez	a3, L(b11)
L(b01):
	ld	l0, 0(lp)
	ld	r0, 0(rp)
ifdef(`OPERATION_add_n',`
	add	l0, l0, r0
	sltu	c1, l0, r0
') ifdef(`OPERATION_sub_n',`
	sltu	c1, l0, r0
	sub	l0, l0, r0
')
	sd	l0, 0(dp)
	bnez	nn, 1f
	mv	a0, c1
	ret
1:
	dual(`	ld	l2, 8(lp)',`	ld	l3, 16(lp)')
	dual(`	addi	nn, nn, -1',`	addi	dp, dp, 8')
	dual(`	ld	r0, 8(rp)',`	ld	r1, 16(rp)')
	dual(`	addi	lp, lp, 8',`	addi	rp, rp, 8')
ifdef(`OPERATION_add_n',`
	dual(`	add	l2, l2, r0',`	add	l3, l3, r1')
	dual(`	sltu	c2, l2, r0',`	add	r0, l2, c1')
	dual(`	ld	l0, 16(lp)',`	ld	l1, 24(lp)')
	dual(`	sltu	c1, l3, r1',`	sltu	c3, r0, l2')
	add	r1, l3, c2
	add	r1, r1, c3
	dual(`	sd	r0, 0(dp)',`	sd	r1, 8(dp)')
	sltu	c3, r1, l3
') ifdef(`OPERATION_sub_n',`
	dual(`	sltu	c2, l2, r0',`	sub	l2, l2, r0')
	dual(`	sltu	c3, l2, c1',`	sub	r0, l2, c1')
	dual(`	sltu	c1, l3, r1',`	sub	l3, l3, r1')
	dual(`	ld	l0, 16(lp)',`	ld	l1, 24(lp)')
	sub	r1, l3, c2
	sub	r1, r1, c3
	dual(`	sd	r0, 0(dp)',`	sd	r1, 8(dp)')
	sltu	c3, l3, r1
')
	dual(`	ld	r0, 16(rp)',`	ld	r1, 24(rp)')
	beqz	nn, L(final2)
	j	L(loop4)

.p2align 4
L(loop4):
	or	c1, c1, c3
L(top):
ifdef(`OPERATION_add_n',`
	dual(`	ld	l2, 32(lp)',`	ld	l3, 40(lp)')
	dual(`	add	l0, l0, r0',`	add	l1, l1, r1')
	dual(`	ld	r2, 32(rp)',`	ld	r3, 40(rp)')
	dual(`	sltu	c2, l0, r0',`	add	r0, l0, c1')
	dual(`	addi	nn, nn, -1',`	bltu	r0, l0, L(prop1)')

1:	dual(`	sltu	c1, l1, r1',`	add	r1, l1, c2')
	dual(`	sd	r0, 16(dp)',`	sd	r1, 24(dp)')
	dual(`	addi	dp, dp, 32',`	bltu	r1, l1, L(prop2)')

2:	dual(`	add	l2, l2, r2',`	add	l3, l3, r3')
	dual(`	sltu	c2, l2, r2',`	add	r2, l2, c1')
	dual(`	ld	l0, 48(lp)',`	ld	l1, 56(lp)')
	bltu	r2, l2, L(prop3)

3:	dual(`	sltu	c1, l3, r3',`	add	r3, l3, c2')
	dual(`	ld	r0, 48(rp)',`	ld	r1, 56(rp)')
	bltu	r3, l3, L(prop4)

4:	dual(`	addi	lp, lp, 32',`	addi	rp, rp, 32')
	dual(`	sd	r2, 0(dp)',`	sd	r3, 8(dp)')
') ifdef(`OPERATION_sub_n',`
	dual(`	ld	l2, 32(lp)',`	ld	l3, 40(lp)')
	dual(`	sltu	c2, l0, r0',`	sub	l0, l0, r0')
	dual(`	ld	r2, 32(rp)',`	ld	r3, 40(rp)')
	dual(`	sub	r0, l0, c1',`	bltu	l0, c1, L(prop1)')

1:	dual(`	sltu	c1, l1, r1',`	sub	l1, l1, r1')
	dual(`	addi	nn, nn, -1',`	addi	rp, rp, 32')
	dual(`	sub	r1, l1, c2',`	bltu	l1, c2, L(prop2)')

2:	dual(`	sltu	c2, l2, r2',`	sub	l2, l2, r2')
	dual(`	sd	r0, 16(dp)',`	sd	r1, 24(dp)')
	dual(`	sub	r2, l2, c1',`	bltu	l2, c1, L(prop3)')

3:	dual(`	sltu	c1, l3, r3',`	sub	l3, l3, r3')
	dual(`	ld	r0, 16(rp)',`	ld	r1, 24(rp)')
	dual(`	sub	r3, l3, c2',`     bltu	l3, c2, L(prop4)')

4:	dual(`	ld	l0, 48(lp)',`	ld	l1, 56(lp)')
	dual(`	addi	dp, dp, 32',`	addi	lp, lp, 32')
	dual(`	sd	r2, 0(dp)',`	sd	r3, 8(dp)')
')
	bnez	nn, L(top)
	li	c3, 0

L(final2):
ifdef(`OPERATION_add_n',`
	dual(`	add	l0, l0, r0',`	add	l1, l1, r1')
	dual(`	sltu	c2, l0, r0',`	add	r0, l0, c1')
	dual(`	add	r0, r0, c3',`	sltu	c1, l1, r1')
	dual(`	sltu	c3, r0, l0',`	add	r1, l1, c2')
	add	r1, r1, c3
	sltu	c3, r1, l1
') ifdef(`OPERATION_sub_n',`
	dual(`	sltu	c2, l0, r0',`	sub	l0, l0, r0')
	dual(`	sub	r0, l0, c1',`	sltu	c1, l1, r1')
	dual(`	sub	r0, r0, c3',`	sub	l1, l1, r1')
	dual(`	sltu	c3, l0, r0',`	sub	r1, l1, c2')
	sub	r1, r1, c3
	sltu	c3, l1, r1
')
	dual(`	sd	r0, 16(dp)',`	sd	r1, 24(dp)')
	or	a0, c1, c3
	ret
EPILOGUE()

L(prop1):
	li	c2, 1
	j	1b
L(prop2):
	li	c1, 1
	j	2b
L(prop3):
	li	c2, 1
	j	3b
L(prop4):
	li	c1, 1
	j	4b

ASM_END()