[PATCH] mpn/{arm,arm64}/mod_34lsub1: Code tweak
David Sparks
sparks05 at proton.me
Sat Feb 14 17:25:05 CET 2026
This is just me playing around. I don't have an ARM system to test
on, so it'll take some careful review and testing, or maybe it'll just
give someone some ideas.
I've avoided changing the carefully-tuned main loops, but improve the
setup/reduce code around them.
ARM32 improvements:
- Avoid using r7, saving one push/pop. Instead, r6 is reused
for the carry.
- For lengths of 1 or 2 limbs, skip push/pop entirely.
- Use shift+add instructions better during the reduction.
This requires masking *before* shifting, which can be
done if we're clever about the immediates. We don't have
to mask off bits that are shifted away.
- The "bics n, n, n, asr #1" instruction is definitely subtle.
Since we're starting with n == -1 or -2 (length 2 or 1, minus 3),
this copies the lsbit of n to the carry flag, and clears n.
The ARM64 code is sufficiently similar that the changes are easily
ported. Two more changes there:
- If the length is a multiple of 3, we can jump straight to the
reduction and skip adding an all-zero "tail" to the accumulator.
- Avoid postincrement addressing modes in the tail handling.
Judging from the way the main loop avoids using them even at the
cost of an extra instruction, they can be slower.
I wish there was a single instruction to set an ARM32 register to
0 or 1 depending on the carry bit, but the closest I found was
sbc r, r, r which sets r to 0 or 2^32-1. A -1 offset can be
handled, but the problem is that 2^32-1 is *not* the same thing
as -1 when we're working mod 2^24-1.
diff --git a/mpn/arm/mod_34lsub1.asm b/mpn/arm/mod_34lsub1.asm
index 596cd3c7f..be21aae7f 100644
--- a/mpn/arm/mod_34lsub1.asm
+++ b/mpn/arm/mod_34lsub1.asm
@@ -54,24 +54,25 @@ ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
- push { r4, r5, r6, r7 }
subs n, n, #3
- mov r7, #0
blt L(le2) C n <= 2
- ldmia ap!, { r2, r3, r12 }
+ ldmia ap!, { r2, r3, r12 } C first 3 limbs into accumulator
subs n, n, #3
+ push { r4, r5, r6 }
blt L(sum) C n <= 5
cmn r0, #0 C clear carry
sub n, n, #3
b L(mid)
+C Add blocks of 3 limbs. The strange loop rotation leaves
+C maximum time for the load to complete.
L(top): adcs r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, r6
L(mid): ldmia ap!, { r4, r5, r6 }
- tst n, n
+ tst n, n C preserves carry
sub n, n, #3
bpl L(top)
@@ -80,45 +81,45 @@ L(mid): ldmia ap!, { r4, r5, r6 }
adcs r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, r6
- movcs r7, #1 C r7 <= 1
-L(sum): cmn n, #2
+C Add the final 0..2 limbs
+L(sum): mov r6, #0
+ adc r6, r6, r6 C stash carry in r6
+ cmn n, #2
movlo r4, #0
ldrhs r4, [ap], #4
movls r5, #0
- ldrhi r5, [ap], #4
+ ldrhi r5, [ap]
adds r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, #0
- adc r7, r7, #0 C r7 <= 2
+
+C At this point r0 (ap) and r1 (n) are available for the final
+C result summation. We sum into n because the L(le2) code can
+C clear it for free before jumping to L(sum1).
+
+ adc n, r6, r12, lsr #8
+ bic r12, r12, #0x0000ff00
+ add n, n, r12, lsl #16
+
+ pop { r4, r5, r6 }
L(sum2):
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- add r0, r0, r7
+ add n, n, r3, lsr #16
+ bic r3, r3, #0x00ff0000
+ add n, n, r3, lsl #8
- mov r7, r3, lsl #8
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r3, lsr #16
+L(sum1):
+ add n, n, r2, lsr #24
+ bic r2, r2, #0xff000000
+ add r0, n, r2
- mov r7, r12, lsl #16
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r12, lsr #8
-
- pop { r4, r5, r6, r7 }
return lr
-L(le2): cmn n, #1
- bne L(1)
- ldmia ap!, { r2, r3 }
- mov r12, #0
+L(le2): ldr r2, [ap], #4
+ bics n, n, n, asr #1 C clear n and put lsbit in carry
+ bcs L(sum1)
+ ldr r3, [ap]
b L(sum2)
-L(1): ldr r2, [ap]
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- pop { r4, r5, r6, r7 }
- return lr
EPILOGUE()
diff --git a/mpn/arm64/mod_34lsub1.asm b/mpn/arm64/mod_34lsub1.asm
index 7945fe72c..6d49c9a26 100644
--- a/mpn/arm64/mod_34lsub1.asm
+++ b/mpn/arm64/mod_34lsub1.asm
@@ -82,40 +82,35 @@ L(top): ldp x5, x6, [ap, #0]
adcs x4, x4, x7
tbz n, #63, L(top)
- adc x8, xzr, xzr C x8 <= 1
+ adc x8, xzr, xzr C x8 := carry
-L(sum): cmn n, #2
- mov x5, #0
- b.lo 1f
- ldr x5, [ap], #8
-1: mov x6, #0
- b.ls 1f
- ldr x6, [ap], #8
+C Fetch and add the final 0..2 limbs
+L(sum): adds x6, n, #2
+ b.lo L(sum3)
+ ldr x5, [ap]
+ b.eq 1f C x6 already clear if taken
+ ldr x6, [ap, #8]
1: adds x2, x2, x5
adcs x3, x3, x6
adcs x4, x4, xzr
- adc x8, x8, xzr C x8 <= 2
-
+C n is now available for summing
+L(sum3):
+ adc n, x8, x4, lsr #16
+ and x4, x4, #0xffff
+ add n, n, x4, lsl #32
L(sum2):
- and x0, x2, #0xffffffffffff
- add x0, x0, x2, lsr #48
- add x0, x0, x8
+ add n, n, x3, lsr #32
+ and x3, x3, #0xffffffff
+ add n, n, x3, lsl #16
- lsl x8, x3, #16
- and x1, x8, #0xffffffffffff
- add x0, x0, x1
- add x0, x0, x3, lsr #32
-
- lsl x8, x4, #32
- and x1, x8, #0xffffffffffff
- add x0, x0, x1
- add x0, x0, x4, lsr #16
+ add n, n, x2, lsr #48
+ and x2, x2, #0xffffffffffff
+ add x0, n, x2
ret
-L(le2): cmn n, #1
+L(le2): adds n, n, #1
b.ne L(1)
ldp x2, x3, [ap]
- mov x4, #0
b L(sum2)
L(1): ldr x2, [ap]
and x0, x2, #0xffffffffffff
More information about the gmp-devel
mailing list