[PATCH 3 of 3] Add MIPS32R1 MADDU-based *mul_1.asm functions
info at mobile-stream.com
info at mobile-stream.com
Wed Sep 11 18:23:35 UTC 2019
Add MIPS32R1 MADDU-based *mul_1.asm functions.
The code tries to keep the [accidental] property of MIPS-II counterparts:
constant-time operation on 32x16 MDUs as found on e.g. 4KEc and some low-
end MCUs. Even if that is unimportant, the performance cost is invisible.
It is faster on all MIPS32R1/R2/R5 CPUs tried (see the cycles/limb tables) and
is expected to be fast with any pipelined MDU. The so-called Area-Efficient MDU
(optional on some MCUs) will run it *much* slower (~3x for addmul_1).
While the functions look similar (especially mul_1 and addmul_1), they are
kept separate because of corner-case (N=1,2,3) tweaks for P5600, which have
no ill effect on at least 4KEc and 24KEc.
diff -r 6ab06c72027e -r 789677d6e8b2 configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -1040,6 +1040,10 @@
mipsisa32r2*-*-*)
SPEED_CYCLECOUNTER_OBJ=mips32r2.lo
cyclecounter_size=1
+ path="mips32/r1 mips32"
+ ;;
+ mipsisa32*-*-*)
+ path="mips32/r1 mips32"
;;
esac
;;
diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/addmul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/addmul_1.asm
@@ -0,0 +1,79 @@
+include(`../config.m4')
+
+C cycles/limb
+C 4KEc 9.68
+C 24Kc 9.52
+C 24KEc 9.55
+C P5600 7.80
+C XBurst 13.55
+
+C INPUT PARAMETERS (mpn_addmul_1: {rp,n} += {up,n} * vl; the final carry limb is left in $v0 at return)
+C rp $a0 destination limbs, read and written
+C up $a1 source limbs, read only
+C n $a2 limb count; n >= 1 is assumed (n = 0 has no dedicated path below)
+C vl $a3 multiplier limb
+
+ASM_START()
+ .set noat C $at is used below as an ordinary register (holds 1, later the last result limb)
+PROLOGUE(mpn_addmul_1)
+ lw $v1,0($a1) C L0: v1 = up[0]
+ ori $at,$zero,1 C at = 1, the multiplier that lets maddu add a plain 32-bit value to HI:LO
+ multu $v1,$a3 C M0: HI:LO = up[0] * vl
+ lw $t0,0($a0) C L1, 32x16 MDU stall; t0 = rp[0]
+ addiu $t1,$a2,-2 C t1 = n - 2
+ beq $at,$a2,1f C n == 1: finish at label 1; NOTE(review): branch delay slots are relied on, presumably noreorder is set by ASM_START - confirm
+ maddu $t0,$at C M0: HI:LO += rp[0] (delay slot, needed on both paths)
+ mfhi $v0 C M0 carry: v0 = high word of up[0]*vl + rp[0]
+ lw $t2,4($a1) C L1: t2 = up[1]
+ beqz $t1,23f C n == 2: only the tail at label 23 remains
+ lw $v1,4($a0) C L1: v1 = rp[1] (delay slot)
+ mflo $t0 C M0: t0 = result limb 0, stored later
+ andi $t3,$t1,1 C t3 = parity of n - 2
+ sll $a2,$t1,2 C a2 = (n - 2) * 4 bytes
+ beqz $t3,0f C even n: go straight to the two-limb loop
+ addu $a2,$a2,$a1 C a2 = up + 4*(n-2), the loop end pointer (delay slot)
+ multu $t2,$a3 C M1: odd n, handle one extra limb first: HI:LO = up[1] * vl
+ lw $t2,8($a1) C L2, 32x16 MDU stall; t2 = up[2]
+ maddu $v1,$at C M1: HI:LO += rp[1]
+ maddu $v0,$at C M1: HI:LO += carry from limb 0
+ mfhi $v0 C M1 carry: v0 = new carry
+ lw $v1,8($a0) C L2: v1 = rp[2]
+ sw $t0,0($a0) C S0: rp[0] = result limb 0
+ beq $at,$t1,23f C n == 3 (t1 == 1): skip the loop
+ addiu $a0,$a0,4 C rp += 1 limb (delay slot)
+ addiu $a1,$a1,4 C up += 1 limb
+ mflo $t0 C M1: t0 = result limb 1
+0: addiu $a1,$a1,8 C loop top, two limbs per pass: up += 2 limbs
+ addiu $a0,$a0,8 C rp += 2 limbs
+ multu $t2,$a3 C M1: HI:LO = (up limb held in t2) * vl
+ lw $t3,0($a1) C L2, 32x16 MDU stall; t3 = next up limb
+ maddu $v1,$at C M1: HI:LO += rp limb held in v1
+ lw $t4,0($a0) C L2: t4 = next rp limb
+ maddu $v0,$at C M1: HI:LO += incoming carry
+ mfhi $v0 C M1 carry: v0 = new carry
+ sw $t0,-8($a0) C S0: store the pending result limb at the pre-increment rp
+ mflo $t1 C M1: t1 = result limb
+ multu $t3,$a3 C M2: HI:LO = (up limb held in t3) * vl
+ lw $t2,4($a1) C L3, 32x16 MDU stall; t2 = up limb for the next pass or the tail
+ maddu $t4,$at C M2: HI:LO += rp limb held in t4
+ lw $v1,4($a0) C L3: v1 = rp limb for the next pass or the tail
+ maddu $v0,$at C M2: HI:LO += incoming carry
+ mfhi $v0 C M2 carry: v0 = new carry
+ sw $t1,-4($a0) C S1: store the first result limb of this pass
+ bne $a1,$a2,0b C repeat until up reaches the end pointer
+23: mflo $t0 C M2: t0 = pending result limb; this is both the delay slot of the bne above and the entry point for n == 2 and n == 3
+ multu $t2,$a3 C M3: last limb: HI:LO = up[n-1] * vl
+ nop C 32x16 MDU stall
+ maddu $v1,$at C M3: HI:LO += rp[n-1]
+ maddu $v0,$at C M3: HI:LO += incoming carry
+ mflo $at C M3: at = last result limb (the constant 1 is no longer needed)
+ mfhi $v0 C M3 carry: v0 = carry-out limb left for the caller
+ sw $t0,0($a0) C S2: store the second-to-last result limb
+ jr $ra
+ sw $at,4($a0) C S3: store the last result limb (delay slot)
+1: mflo $at C n == 1 path: at = low word of up[0]*vl + rp[0]
+ mfhi $v0 C v0 = carry-out limb left for the caller
+ jr $ra
+ sw $at,0($a0) C rp[0] = result limb (delay slot)
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/mul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/mul_1.asm
@@ -0,0 +1,69 @@
+include(`../config.m4')
+
+C cycles/limb
+C 4KEc 7.66
+C 24Kc 7.54
+C 24KEc 7.55
+C P5600 7.04
+C XBurst 10.54
+
+C INPUT PARAMETERS (mpn_mul_1: {rp,n} = {up,n} * vl; the final carry limb is left in $v0 at return)
+C rp $a0 destination limbs, written only
+C up $a1 source limbs, read only
+C n $a2 limb count; n >= 1 is assumed (n = 0 has no dedicated path below)
+C vl $a3 multiplier limb
+
+ASM_START()
+ .set noat C $at is used below as an ordinary register (holds 1, later the last result limb)
+PROLOGUE(mpn_mul_1)
+ lw $v1,0($a1) C L0: v1 = up[0]
+ ori $at,$zero,1 C at = 1, the multiplier that lets maddu add a plain 32-bit value to HI:LO
+ multu $v1,$a3 C M0: HI:LO = up[0] * vl
+ beq $at,$a2,1f C 32x16 MDU stall; n == 1: finish at label 1; NOTE(review): branch delay slots are relied on, presumably noreorder is set by ASM_START - confirm
+ addiu $t1,$a2,-2 C t1 = n - 2 (delay slot)
+ mfhi $v0 C M0 carry: v0 = high word of up[0] * vl
+ beqz $t1,23f C n == 2: only the tail at label 23 remains
+ lw $t2,4($a1) C L1: t2 = up[1] (delay slot)
+ mflo $t0 C M0: t0 = result limb 0, stored later
+ andi $t3,$t1,1 C t3 = parity of n - 2
+ sll $a2,$t1,2 C a2 = (n - 2) * 4 bytes
+ beqz $t3,0f C even n: go straight to the two-limb loop
+ addu $a2,$a2,$a1 C a2 = up + 4*(n-2), the loop end pointer (delay slot)
+ multu $t2,$a3 C M1: odd n, handle one extra limb first: HI:LO = up[1] * vl
+ lw $t2,8($a1) C L2, 32x16 MDU stall; t2 = up[2]
+ maddu $v0,$at C M1: HI:LO += carry from limb 0
+ mfhi $v0 C M1 carry: v0 = new carry
+ sw $t0,0($a0) C S0: rp[0] = result limb 0
+ beq $at,$t1,23f C n == 3 (t1 == 1): skip the loop
+ addiu $a0,$a0,4 C rp += 1 limb (delay slot)
+ addiu $a1,$a1,4 C up += 1 limb
+ mflo $t0 C M1: t0 = result limb 1
+0: addiu $a1,$a1,8 C loop top, two limbs per pass: up += 2 limbs
+ addiu $a0,$a0,8 C rp += 2 limbs
+ multu $t2,$a3 C M1: HI:LO = (up limb held in t2) * vl
+ lw $t3,0($a1) C L2, 32x16 MDU stall; t3 = next up limb
+ maddu $v0,$at C M1: HI:LO += incoming carry
+ mfhi $v0 C M1 carry: v0 = new carry
+ sw $t0,-8($a0) C S0: store the pending result limb at the pre-increment rp
+ mflo $t1 C M1: t1 = result limb
+ multu $t3,$a3 C M2: HI:LO = (up limb held in t3) * vl
+ lw $t2,4($a1) C L3, 32x16 MDU stall; t2 = up limb for the next pass or the tail
+ maddu $v0,$at C M2: HI:LO += incoming carry
+ mfhi $v0 C M2 carry: v0 = new carry
+ sw $t1,-4($a0) C S1: store the first result limb of this pass
+ bne $a1,$a2,0b C repeat until up reaches the end pointer
+23: mflo $t0 C M2: t0 = pending result limb; this is both the delay slot of the bne above and the entry point for n == 2 and n == 3
+ multu $t2,$a3 C M3: last limb: HI:LO = up[n-1] * vl
+ nop C 32x16 MDU stall
+ maddu $v0,$at C M3: HI:LO += incoming carry
+ mflo $at C M3: at = last result limb (the constant 1 is no longer needed)
+ mfhi $v0 C M3 carry: v0 = carry-out limb left for the caller
+ sw $t0,0($a0) C S2: store the second-to-last result limb
+ jr $ra
+ sw $at,4($a0) C S3: store the last result limb (delay slot)
+1: mflo $at C n == 1 path: at = low word of up[0] * vl
+ mfhi $v0 C v0 = carry-out limb left for the caller
+ jr $ra
+ sw $at,0($a0) C rp[0] = result limb (delay slot)
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff -r 6ab06c72027e -r 789677d6e8b2 mpn/mips32/r1/submul_1.asm
--- /dev/null
+++ b/mpn/mips32/r1/submul_1.asm
@@ -0,0 +1,85 @@
+include(`../config.m4')
+
+C cycles/limb
+C 4KEc 10.72
+C 24Kc 10.54
+C 24KEc 10.55
+C P5600 8.07
+C XBurst 13.55
+
+C INPUT PARAMETERS (mpn_submul_1: {rp,n} -= {up,n} * vl; the final borrow limb is left in $v0 at return)
+C rp $a0 destination limbs, read and written
+C up $a1 source limbs, read only
+C n $a2 limb count; n >= 1 is assumed (n = 0 has no dedicated path below)
+C vl $a3 multiplier limb
+
+ASM_START()
+ .set noat C $at is used below as an ordinary register (holds 1, later the last result limb)
+PROLOGUE(mpn_submul_1)
+ lw $v1,0($a0) C L1: v1 = rp[0]
+ ori $at,$zero,1 C at = 1, the multiplier that lets multu/msubu load or subtract a plain 32-bit value
+ lw $t0,0($a1) C L0: t0 = up[0]
+ multu $v1,$at C M0: HI:LO = rp[0]
+ msubu $t0,$a3 C M0: HI:LO -= up[0] * vl
+ beq $at,$a2,1f C 32x16 MDU stall; n == 1: finish at label 1; NOTE(review): branch delay slots are relied on, presumably noreorder is set by ASM_START - confirm
+ addiu $t1,$a2,-2 C t1 = n - 2 (delay slot)
+ mfhi $v0 C M0 carry: v0 = high word, i.e. the borrow in negated form (negu below makes it positive)
+ lw $v1,4($a0) C L1: v1 = rp[1]
+ beqz $t1,23f C n == 2: only the tail at label 23 remains
+ lw $t2,4($a1) C L1: t2 = up[1] (delay slot)
+ mflo $t0 C M0: t0 = result limb 0, stored later
+ andi $t3,$t1,1 C t3 = parity of n - 2
+ sll $a2,$t1,2 C a2 = (n - 2) * 4 bytes
+ beqz $t3,0f C even n: go straight to the two-limb loop
+ addu $a2,$a2,$a1 C a2 = up + 4*(n-2), the loop end pointer (delay slot)
+ negu $v0 C M1: odd n, handle one extra limb first: v0 = borrow as a positive value
+ multu $v1,$at C M1: HI:LO = rp[1]
+ msubu $t2,$a3 C M1: HI:LO -= up[1] * vl
+ addiu $a0,$a0,4 C 32x16 MDU stall; rp += 1 limb
+ msubu $v0,$at C M1: HI:LO -= borrow from limb 0
+ mfhi $v0 C M1 carry: v0 = new borrow, negated form
+ lw $v1,4($a0) C L2: v1 = rp[2] (rp was already advanced by one limb)
+ lw $t2,8($a1) C L2: t2 = up[2]
+ beq $at,$t1,23f C n == 3 (t1 == 1): skip the loop
+ sw $t0,-4($a0) C S0: rp[0] = result limb 0 (delay slot)
+ addiu $a1,$a1,4 C up += 1 limb
+ mflo $t0 C M1: t0 = result limb 1
+0: addiu $a0,$a0,8 C loop top, two limbs per pass: rp += 2 limbs
+ addiu $a1,$a1,8 C up += 2 limbs
+ multu $v1,$at C M1: HI:LO = rp limb held in v1
+ lw $t4,0($a0) C L2: t4 = next rp limb
+ lw $t3,0($a1) C L2: t3 = next up limb
+ msubu $t2,$a3 C M1: HI:LO -= (up limb held in t2) * vl
+ negu $v0 C M1, 32x16 MDU stall; v0 = incoming borrow as a positive value
+ msubu $v0,$at C M1: HI:LO -= incoming borrow
+ mfhi $v0 C M1 carry: v0 = new borrow, negated form
+ sw $t0,-8($a0) C S0: store the pending result limb at the pre-increment rp
+ mflo $t1 C M1: t1 = result limb
+ multu $t4,$at C M2: HI:LO = rp limb held in t4
+ lw $v1,4($a0) C L3: v1 = rp limb for the next pass or the tail
+ lw $t2,4($a1) C L3: t2 = up limb for the next pass or the tail
+ msubu $t3,$a3 C M2: HI:LO -= (up limb held in t3) * vl
+ negu $v0 C M2, 32x16 MDU stall; v0 = incoming borrow as a positive value
+ msubu $v0,$at C M2: HI:LO -= incoming borrow
+ mfhi $v0 C M2 carry: v0 = new borrow, negated form
+ sw $t1,-4($a0) C S1: store the first result limb of this pass
+ bne $a1,$a2,0b C repeat until up reaches the end pointer
+23: mflo $t0 C M2: t0 = pending result limb; this is both the delay slot of the bne above and the entry point for n == 2 and n == 3
+ multu $v1,$at C M3: last limb: HI:LO = rp[n-1]
+ msubu $t2,$a3 C M3: HI:LO -= up[n-1] * vl
+ negu $v0 C M3, 32x16 MDU stall; v0 = incoming borrow as a positive value
+ sw $t0,0($a0) C S2: store the second-to-last result limb
+ msubu $v0,$at C M3: HI:LO -= incoming borrow
+ mflo $at C M3: at = last result limb (the constant 1 is no longer needed)
+ mfhi $v0 C M3 carry: v0 = final borrow, negated form
+ sw $at,4($a0) C S3: store the last result limb
+ jr $ra
+ negu $v0 C v0 = final borrow as a positive value, left for the caller (delay slot)
+1: mflo $at C n == 1 path: at = low word of rp[0] - up[0]*vl
+ mfhi $v0 C v0 = borrow, negated form
+ sw $at,0($a0) C rp[0] = result limb
+ jr $ra
+ negu $v0 C v0 = borrow as a positive value, left for the caller (delay slot)
+EPILOGUE(mpn_submul_1)
+
+ASM_END()
More information about the gmp-devel
mailing list