[PATCH 2 of 3][v2] Add MADDU-based *mul_1.asm functions for MIPS32R1+

info at mobile-stream.com info at mobile-stream.com
Wed Nov 6 15:50:55 UTC 2019


Add MADDU-based *mul_1.asm implementation for MIPS32R1.

It is faster on all tested MIPS32R1/R2/R5 CPUs (see the cycles/limb tables)
and is expected to be fast with any pipelined MDU. Note, however, that the
Area-Efficient MDU (an optional non-pipelined Multiply-Divide Unit available
for some MIPS cores and MCUs) will run it much slower (~3x for addmul_1).

The code tries to keep the [accidental] property of the MIPS-II counterparts:
constant-time operation on 32x16 MDUs as found on e.g. the 4KEc and some
low-end MCUs. Even if this is unimportant, its performance cost is negligible.

While the functions look similar (especially mul_1 and addmul_1), they are
kept separate because of corner-case (N=1,2,3) tweaks for the P5600; these
tweaks have no ill effect on at least the 4KEc and 24KEc.

v2 improves addmul_1 on P5600 to a more natural ~7.5 c/l and adds 74Kc c/l figures.

diff -r 16691e684a95 -r 7423efa39db8 configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -1055,7 +1055,10 @@
 	    path_64="mips64/hilo mips64"
 	    ;;
 	esac
-
+        ;;
+
+      mipsisa32*-*-*)
+        path="mips32/r1 mips32"
         ;;
     esac
     ;;
diff -r 16691e684a95 -r 7423efa39db8 mpn/mips32/r1/addmul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/addmul_1.asm
@@ -0,0 +1,80 @@
+include(`../config.m4')
+
+C	     cycles/limb
+C 4KEc		 9.68
+C 24Kc		 9.52
+C 24KEc		 9.55
+C 74Kc		 7.05
+C P5600		 7.57
+C XBurst	13.55
+
+C INPUT PARAMETERS
+C rp		$a0	limbs are loaded from and stored back to this array
+C up		$a1	limbs are only loaded
+C n		$a2	limb count; n = 0 is not handled (assumes n >= 1, TODO confirm)
+C vl		$a3	multiplier word
+
+ASM_START()
+	.set	noat		C $at is used explicitly below as the constant 1
+PROLOGUE(mpn_addmul_1)	C rp[i] += up[i] * vl for n limbs; last high word ends up in $v0
+	lw	$v1,0($a1)	C L0: load up[0]
+	ori	$at,$zero,1	C $at = 1, so maddu reg,$at adds reg to HI:LO
+	multu	$v1,$a3		C M0: HI:LO = up[0] * vl
+	lw	$t0,0($a0)	C L1, 32x16 MDU stall: load rp[0]
+	addiu	$t1,$a2,-2	C $t1 = n - 2
+	beq	$at,$a2,1f	C n == 1: finish at 1f
+	 maddu	$t0,$at		C M0 (delay slot): HI:LO += rp[0]
+	mfhi	$v0		C M0 carry: high word of limb 0
+	lw	$t2,4($a1)	C L1: load up[1]
+	beqz	$t1,23f		C n == 2: only the final limb is left
+	 lw	$v1,4($a0)	C L1 (delay slot): load rp[1]
+	mflo	$t0		C M0: low word, stored later as rp[0]
+	andi	$t3,$t1,1	C parity of n - 2
+	sll	$a2,$t1,2	C byte size of n - 2 limbs
+	beqz	$t3,0f		C even n: enter the two-limb loop directly
+	 addu	$a2,$a2,$a1	C (delay slot) $a2 = up + 4*(n-2), loop end address
+	multu	$t2,$a3		C M1: odd n, limb 1 is done alone: up[1] * vl
+	lw	$t2,8($a1)	C L2, 32x16 MDU stall: load up[2]
+	maddu	$v1,$at		C M1: HI:LO += rp[1]
+	maddu	$v0,$at		C M1: HI:LO += carry from limb 0
+	mfhi	$v0		C M1 carry
+	lw	$v1,8($a0)	C L2: load rp[2]
+	sw	$t0,0($a0)	C S0: store rp[0]
+	beq	$at,$t1,23f	C n == 3: only the final limb is left
+	 addiu	$a0,$a0,4	C (delay slot) advance rp by one limb
+	addiu	$a1,$a1,4	C advance up by one limb
+	mflo	$t0		C M1: low word, stored by the first loop pass
+0:	addiu	$a1,$a1,8	C loop: two limbs per pass, advance up
+	addiu	$a0,$a0,8	C advance rp by two limbs
+	multu	$t2,$a3		C M1: first limb of the pair, preloaded up limb * vl
+	lw	$t3,0($a1)	C L2, 32x16 MDU stall: up limb for the second multiply
+	maddu	$v1,$at		C M1: HI:LO += preloaded rp limb
+	lw	$t4,0($a0)	C L2: rp limb for the second multiply
+	maddu	$v0,$at		C M1: HI:LO += incoming carry
+	mfhi	$v0		C M1 carry
+	sw	$t0,-8($a0)	C S0: store the low word computed before this pass
+	mflo	$t1		C M1: low word of the first limb
+	multu	$t3,$a3		C M2: second limb of the pair
+	lw	$t2,4($a1)	C L3, 32x16 MDU stall: preload the next up limb
+	maddu	$t4,$at		C M2: HI:LO += rp limb
+	lw	$v1,4($a0)	C L3: preload the next rp limb
+	maddu	$v0,$at		C M2: HI:LO += carry
+	sw	$t1,-4($a0)	C S1: store the first limb of the pair
+	mfhi	$v0		C M2 carry
+	bne	$a1,$a2,0b	C repeat until up reaches the end address
+23:	 mflo	$t0		C M2: delay slot of the loop, also the entry for n = 2 and n = 3
+	multu	$t2,$a3		C M3: final limb, preloaded up limb * vl
+	nop			C     32x16 MDU stall
+	maddu	$v1,$at		C M3: HI:LO += preloaded rp limb
+	maddu	$v0,$at		C M3: HI:LO += carry
+	mflo	$at		C M3: $at no longer needed as 1, reused for the low word
+	mfhi	$v0		C M3 carry: final high word, left in $v0
+	sw	$t0,0($a0)	C S2: store the second to last limb
+	jr	$ra		C return
+	 sw	$at,4($a0)	C S3 (delay slot): store the last limb
+1:	mflo	$at		C n == 1: low word of up[0] * vl + rp[0]
+	mfhi	$v0		C high word, left in $v0
+	jr	$ra		C return
+	 sw	$at,0($a0)	C (delay slot) store rp[0]
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff -r 16691e684a95 -r 7423efa39db8 mpn/mips32/r1/mul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/mul_1.asm
@@ -0,0 +1,70 @@
+include(`../config.m4')
+
+C	     cycles/limb
+C 4KEc		 7.66
+C 24Kc		 7.54
+C 24KEc		 7.55
+C 74Kc		 7.04
+C P5600		 7.04
+C XBurst	10.54
+
+C INPUT PARAMETERS
+C rp		$a0	limbs are only stored
+C up		$a1	limbs are only loaded
+C n		$a2	limb count; n = 0 is not handled (assumes n >= 1, TODO confirm)
+C vl		$a3	multiplier word
+
+ASM_START()
+	.set	noat		C $at is used explicitly below as the constant 1
+PROLOGUE(mpn_mul_1)	C rp[i] = up[i] * vl for n limbs; last high word ends up in $v0
+	lw	$v1,0($a1)	C L0: load up[0]
+	ori	$at,$zero,1	C $at = 1, so maddu reg,$at adds reg to HI:LO
+	multu	$v1,$a3		C M0: HI:LO = up[0] * vl
+	beq	$at,$a2,1f	C     32x16 MDU stall; n == 1: finish at 1f
+	 addiu	$t1,$a2,-2	C (delay slot) $t1 = n - 2
+	mfhi	$v0		C M0 carry: high word of limb 0
+	beqz	$t1,23f		C n == 2: only the final limb is left
+	 lw	$t2,4($a1)	C L1 (delay slot): load up[1]
+	mflo	$t0		C M0: low word, stored later as rp[0]
+	andi	$t3,$t1,1	C parity of n - 2
+	sll	$a2,$t1,2	C byte size of n - 2 limbs
+	beqz	$t3,0f		C even n: enter the two-limb loop directly
+	 addu	$a2,$a2,$a1	C (delay slot) $a2 = up + 4*(n-2), loop end address
+	multu	$t2,$a3		C M1: odd n, limb 1 is done alone: up[1] * vl
+	lw	$t2,8($a1)	C L2, 32x16 MDU stall: load up[2]
+	maddu	$v0,$at		C M1: HI:LO += carry from limb 0
+	mfhi	$v0		C M1 carry
+	sw	$t0,0($a0)	C S0: store rp[0]
+	beq	$at,$t1,23f	C n == 3: only the final limb is left
+	 addiu	$a0,$a0,4	C (delay slot) advance rp by one limb
+	addiu	$a1,$a1,4	C advance up by one limb
+	mflo	$t0		C M1: low word, stored by the first loop pass
+0:	addiu	$a1,$a1,8	C loop: two limbs per pass, advance up
+	addiu	$a0,$a0,8	C advance rp by two limbs
+	multu	$t2,$a3		C M1: first limb of the pair, preloaded up limb * vl
+	lw	$t3,0($a1)	C L2, 32x16 MDU stall: up limb for the second multiply
+	maddu	$v0,$at		C M1: HI:LO += incoming carry
+	mfhi	$v0		C M1 carry
+	sw	$t0,-8($a0)	C S0: store the low word computed before this pass
+	mflo	$t1		C M1: low word of the first limb
+	multu	$t3,$a3		C M2: second limb of the pair
+	lw	$t2,4($a1)	C L3, 32x16 MDU stall: preload the next up limb
+	maddu	$v0,$at		C M2: HI:LO += carry
+	mfhi	$v0		C M2 carry
+	sw	$t1,-4($a0)	C S1: store the first limb of the pair
+	bne	$a1,$a2,0b	C repeat until up reaches the end address
+23:	 mflo	$t0		C M2: delay slot of the loop, also the entry for n = 2 and n = 3
+	multu	$t2,$a3		C M3: final limb, preloaded up limb * vl
+	nop			C     32x16 MDU stall
+	maddu	$v0,$at		C M3: HI:LO += carry
+	mflo	$at		C M3: $at no longer needed as 1, reused for the low word
+	mfhi	$v0		C M3 carry: final high word, left in $v0
+	sw	$t0,0($a0)	C S2: store the second to last limb
+	jr	$ra		C return
+	 sw	$at,4($a0)	C S3 (delay slot): store the last limb
+1:	mflo	$at		C n == 1: low word of up[0] * vl
+	mfhi	$v0		C high word, left in $v0
+	jr	$ra		C return
+	 sw	$at,0($a0)	C (delay slot) store rp[0]
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff -r 16691e684a95 -r 7423efa39db8 mpn/mips32/r1/submul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/submul_1.asm
@@ -0,0 +1,86 @@
+include(`../config.m4')
+
+C	     cycles/limb
+C 4KEc		10.72
+C 24Kc		10.54
+C 24KEc		10.55
+C 74Kc		 9.04
+C P5600		 8.07
+C XBurst	13.55
+
+C INPUT PARAMETERS
+C rp		$a0	limbs are loaded from and stored back to this array
+C up		$a1	limbs are only loaded
+C n		$a2	limb count; n = 0 is not handled (assumes n >= 1, TODO confirm)
+C vl		$a3	multiplier word
+
+ASM_START()
+	.set	noat		C $at is used explicitly below as the constant 1
+PROLOGUE(mpn_submul_1)	C rp[i] -= up[i] * vl for n limbs; final borrow ends up in $v0
+	lw	$v1,0($a0)	C L1: load rp[0]
+	ori	$at,$zero,1	C $at = 1, so multu/msubu reg,$at load/subtract reg itself
+	lw	$t0,0($a1)	C L0: load up[0]
+	multu	$v1,$at		C M0: HI:LO = rp[0]
+	msubu	$t0,$a3		C M0: HI:LO -= up[0] * vl
+	beq	$at,$a2,1f	C     32x16 MDU stall; n == 1: finish at 1f
+	 addiu	$t1,$a2,-2	C (delay slot) $t1 = n - 2
+	mfhi	$v0		C M0 carry: high word, i.e. the negated borrow
+	lw	$v1,4($a0)	C L1: load rp[1]
+	beqz	$t1,23f		C n == 2: only the final limb is left
+	 lw	$t2,4($a1)	C L1 (delay slot): load up[1]
+	mflo	$t0		C M0: low word, stored later as rp[0]
+	andi	$t3,$t1,1	C parity of n - 2
+	sll	$a2,$t1,2	C byte size of n - 2 limbs
+	beqz	$t3,0f		C even n: enter the two-limb loop directly
+	 addu	$a2,$a2,$a1	C (delay slot) $a2 = up + 4*(n-2), loop end address
+	negu	$v0		C M1: odd n, limb 1 is done alone; borrow made non-negative
+	multu	$v1,$at		C M1: HI:LO = rp[1]
+	msubu	$t2,$a3		C M1: HI:LO -= up[1] * vl
+	addiu	$a0,$a0,4	C     32x16 MDU stall; advance rp by one limb
+	msubu	$v0,$at		C M1: HI:LO -= borrow from limb 0
+	mfhi	$v0		C M1 carry: negated borrow
+	lw	$v1,4($a0)	C L2: load rp[2] (rp was already advanced)
+	lw	$t2,8($a1)	C L2: load up[2]
+	beq	$at,$t1,23f	C n == 3: only the final limb is left
+	 sw	$t0,-4($a0)	C S0 (delay slot): store rp[0]
+	addiu	$a1,$a1,4	C advance up by one limb
+	mflo	$t0		C M1: low word, stored by the first loop pass
+0:	addiu	$a0,$a0,8	C loop: two limbs per pass, advance rp
+	addiu	$a1,$a1,8	C advance up by two limbs
+	multu	$v1,$at		C M1: first limb of the pair, HI:LO = preloaded rp limb
+	lw	$t4,0($a0)	C L2: rp limb for the second half
+	lw	$t3,0($a1)	C L2: up limb for the second half
+	msubu	$t2,$a3		C M1: HI:LO -= preloaded up limb * vl
+	negu	$v0		C M1, 32x16 MDU stall: borrow made non-negative
+	msubu	$v0,$at		C M1: HI:LO -= incoming borrow
+	mfhi	$v0		C M1 carry: negated borrow
+	sw	$t0,-8($a0)	C S0: store the low word computed before this pass
+	mflo	$t1		C M1: low word of the first limb
+	multu	$t4,$at		C M2: second limb of the pair, HI:LO = rp limb
+	lw	$v1,4($a0)	C L3: preload the next rp limb
+	lw	$t2,4($a1)	C L3: preload the next up limb
+	msubu	$t3,$a3		C M2: HI:LO -= up limb * vl
+	negu	$v0		C M2, 32x16 MDU stall: borrow made non-negative
+	msubu	$v0,$at		C M2: HI:LO -= borrow
+	mfhi	$v0		C M2 carry: negated borrow
+	sw	$t1,-4($a0)	C S1: store the first limb of the pair
+	bne	$a1,$a2,0b	C repeat until up reaches the end address
+23:	 mflo	$t0		C M2: delay slot of the loop, also the entry for n = 2 and n = 3
+	multu	$v1,$at		C M3: final limb, HI:LO = preloaded rp limb
+	msubu	$t2,$a3		C M3: HI:LO -= preloaded up limb * vl
+	negu	$v0		C M3, 32x16 MDU stall: borrow made non-negative
+	sw	$t0,0($a0)	C S2: store the second to last limb
+	msubu	$v0,$at		C M3: HI:LO -= borrow
+	mflo	$at		C M3: $at no longer needed as 1, reused for the low word
+	mfhi	$v0		C M3 carry: negated final borrow
+	sw	$at,4($a0)	C S3: store the last limb
+	jr	$ra		C return
+	 negu	$v0		C (delay slot) leave the borrow in $v0 as a non-negative value
+1:	mflo	$at		C n == 1: low word of rp[0] - up[0] * vl
+	mfhi	$v0		C high word: negated borrow
+	sw	$at,0($a0)	C store rp[0]
+	jr	$ra		C return
+	 negu	$v0		C (delay slot) leave the borrow in $v0 as a non-negative value
+EPILOGUE(mpn_submul_1)
+
+ASM_END()





More information about the gmp-devel mailing list