[PATCH] mpn/{arm,arm64}/mod_34lsub1: Code tweak
David Sparks
sparks05 at proton.me
Sat Feb 14 17:25:05 CET 2026
This is just me playing around. I don't have an ARM system to test
on, so it'll take some careful review and testing, or maybe it'll just
give someone some ideas.
I've avoided changing the carefully-tuned main loops, but improve the
setup/reduce code around them.
ARM32 improvements:
- Avoid using r7, saving one push/pop. Instead, r6 is reused
for the carry.
- For lengths of 1 or 2 limbs, skip push/pop entirely.
- Use shift+add instructions better during the reduction.
This requires masking *before* shifting, which can be
done if we're clever about the immediates. We don't have
to mask off bits that are shifted away.
- The "bics n, n, n, asr #1" instruction is definitely subtle.
Since we're starting with n == -1 or -2 (length 2 or 1, minus 3),
this copies the lsbit of n to the carry flag, and clears n.
The ARM64 code is sufficiently similar that the changes are easily
ported. Two more changes there:
- If the length is a multiple of 3, we can jump straight to the
reduction and skip adding an all-zero "tail" to the accumulator.
- Avoid postincrement addressing modes in the tail handling.
Judging from the way the main loop avoids using them even at the
cost of an extra instruction, they can be slower.
I wish there was a single instruction to set an ARM32 register to
0 or 1 depending on the carry bit, but the closest I found was
sbc r, r, r which sets r to 0 or 2^32-1. A -1 offset can be
handled, but the problem is that 2^32-1 is *not* the same thing
as -1 when we're working mod 2^24-1.
diff --git a/mpn/arm/mod_34lsub1.asm b/mpn/arm/mod_34lsub1.asm
index 596cd3c7f..be21aae7f 100644
--- a/mpn/arm/mod_34lsub1.asm
+++ b/mpn/arm/mod_34lsub1.asm
@@ -54,24 +54,25 @@ ASM_START()
TEXT
ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
- push { r4, r5, r6, r7 }
subs n, n, #3
- mov r7, #0
blt L(le2) C n <= 2
- ldmia ap!, { r2, r3, r12 }
+ ldmia ap!, { r2, r3, r12 } C first 3 limbs into accumulator
subs n, n, #3
+ push { r4, r5, r6 }
blt L(sum) C n <= 5
cmn r0, #0 C clear carry
sub n, n, #3
b L(mid)
+C Add blocks of 3 limbs. The strange loop rotation leaves
+C maximum time for the load to complete.
L(top): adcs r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, r6
L(mid): ldmia ap!, { r4, r5, r6 }
- tst n, n
+ tst n, n C preserves carry
sub n, n, #3
bpl L(top)
@@ -80,45 +81,45 @@ L(mid): ldmia ap!, { r4, r5, r6 }
adcs r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, r6
- movcs r7, #1 C r7 <= 1
-L(sum): cmn n, #2
+C Add the final 0..2 limbs
+L(sum): mov r6, #0
+ adc r6, r6, r6 C stash carry in r6
+ cmn n, #2
movlo r4, #0
ldrhs r4, [ap], #4
movls r5, #0
- ldrhi r5, [ap], #4
+ ldrhi r5, [ap]
adds r2, r2, r4
adcs r3, r3, r5
adcs r12, r12, #0
- adc r7, r7, #0 C r7 <= 2
+
+C At this point r0 (ap) and r1 (n) are available for the final
+C result summation. We sum into n because the L(le2) code can
+C clear it for free before jumping to L(sum1).
+
+ adc n, r6, r12, lsr #8
+ bic r12, r12, #0x0000ff00
+ add n, n, r12, lsl #16
+
+ pop { r4, r5, r6 }
L(sum2):
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- add r0, r0, r7
+ add n, n, r3, lsr #16
+ bic r3, r3, #0x00ff0000
+ add n, n, r3, lsl #8
- mov r7, r3, lsl #8
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r3, lsr #16
+L(sum1):
+ add n, n, r2, lsr #24
+ bic r2, r2, #0xff000000
+ add r0, n, r2
- mov r7, r12, lsl #16
- bic r1, r7, #0xff000000
- add r0, r0, r1
- add r0, r0, r12, lsr #8
-
- pop { r4, r5, r6, r7 }
return lr
-L(le2): cmn n, #1
- bne L(1)
- ldmia ap!, { r2, r3 }
- mov r12, #0
+L(le2): ldr r2, [ap], #4
+ bics n, n, n, asr #1 C clear n and put lsbit in carry
+ bcs L(sum1)
+ ldr r3, [ap]
b L(sum2)
-L(1): ldr r2, [ap]
- bic r0, r2, #0xff000000
- add r0, r0, r2, lsr #24
- pop { r4, r5, r6, r7 }
- return lr
EPILOGUE()
diff --git a/mpn/arm64/mod_34lsub1.asm b/mpn/arm64/mod_34lsub1.asm
index 7945fe72c..6d49c9a26 100644
--- a/mpn/arm64/mod_34lsub1.asm
+++ b/mpn/arm64/mod_34lsub1.asm
@@ -82,40 +82,35 @@ L(top): ldp x5, x6, [ap, #0]
adcs x4, x4, x7
tbz n, #63, L(top)
- adc x8, xzr, xzr C x8 <= 1
+ adc x8, xzr, xzr C x8 := carry
-L(sum): cmn n, #2
- mov x5, #0
- b.lo 1f
- ldr x5, [ap], #8
-1: mov x6, #0
- b.ls 1f
- ldr x6, [ap], #8
+C Fetch and add the final 0..2 limbs
+L(sum): adds x6, n, #2
+ b.lo L(sum3)
+ ldr x5, [ap]
+ b.eq 1f C x6 already clear if taken
+ ldr x6, [ap, #8]
1: adds x2, x2, x5
adcs x3, x3, x6
adcs x4, x4, xzr
- adc x8, x8, xzr C x8 <= 2
-
+C n is now available for summing
+L(sum3):
+ adc n, x8, x4, lsr #16
+ and x4, x4, #0xffff
+ add n, n, x4, lsl #32
L(sum2):
- and x0, x2, #0xffffffffffff
- add x0, x0, x2, lsr #48
- add x0, x0, x8
+ add n, n, x3, lsr #32
+ and x3, x3, #0xffffffff
+ add n, n, x3, lsl #16
- lsl x8, x3, #16
- and x1, x8, #0xffffffffffff
- add x0, x0, x1
- add x0, x0, x3, lsr #32
-
- lsl x8, x4, #32
- and x1, x8, #0xffffffffffff
- add x0, x0, x1
- add x0, x0, x4, lsr #16
+ add n, n, x2, lsr #48
+ and x2, x2, #0xffffffffffff
+ add x0, n, x2
ret
-L(le2): cmn n, #1
+L(le2): adds n, n, #1
b.ne L(1)
ldp x2, x3, [ap]
- mov x4, #0
b L(sum2)
L(1): ldr x2, [ap]
and x0, x2, #0xffffffffffff
More information about the gmp-devel
mailing list