[Gmp-commit] /home/hgfiles/gmp: Rewrite alpha mpn_mod_1s_4p_cps.

mercurial at gmplib.org
Mon Mar 15 13:16:50 CET 2010


details:   /home/hgfiles/gmp/rev/66b94f02bf84
changeset: 13496:66b94f02bf84
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Mar 15 13:16:46 2010 +0100
description:
Rewrite alpha mpn_mod_1s_4p_cps.

diffstat:

 ChangeLog                 |    2 +
 mpn/alpha/ev6/mod_1_4.asm |  210 ++++++++++++++++++++++-----------------------
 2 files changed, 105 insertions(+), 107 deletions(-)

diffs (truncated from 313 to 300 lines):

diff -r 228d9deade5b -r 66b94f02bf84 ChangeLog
--- a/ChangeLog	Mon Mar 15 02:12:46 2010 +0100
+++ b/ChangeLog	Mon Mar 15 13:16:46 2010 +0100
@@ -1,5 +1,7 @@
 2010-03-15  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/alpha/ev6/mod_1_4.asm (mpn_mod_1s_4p_cps): Rewrite.
+
 	* mpn/ia64/aors_n.asm: Insert explicitly typed nops to trigger intended
 	bundling.
 	* mpn/ia64/aorslsh1_n.asm: Likewise.
diff -r 228d9deade5b -r 66b94f02bf84 mpn/alpha/ev6/mod_1_4.asm
--- a/mpn/alpha/ev6/mod_1_4.asm	Mon Mar 15 02:12:46 2010 +0100
+++ b/mpn/alpha/ev6/mod_1_4.asm	Mon Mar 15 13:16:46 2010 +0100
@@ -2,7 +2,7 @@
 
 dnl  Contributed to the GNU project by Torbjorn Granlund.
 
-dnl  Copyright 2009 Free Software Foundation, Inc.
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -24,8 +24,9 @@
 C TODO:
 C  * Optimise.  2.75 c/l should be possible.
 C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
-C  * Make mpn_mod_1s_4p_cps work for ev4-ev5.
 C  * Optimise feed-in code, starting the sw pipeline in switch code.
+C  * Shorten software pipeline.  The mul instructions are scheduled too far
+C    from their users.
 C  * Use fewer registers.  Use r28 and r27.
 C  * If we cannot reduce register usage, write perhaps small-n basecase.
 C  * Does it work for PIC?
@@ -48,25 +49,20 @@
 define(`B5modb', `r5')
 
 ASM_START()
-	.arch	ev56
-
 PROLOGUE(mpn_mod_1s_4p)
-	lda	r30, -80(r30)
+	lda	r30, -64(r30)
 	stq	r9, 8(r30)
+	ldq	B1modb, 16(r19)
 	stq	r10, 16(r30)
+	ldq	B2modb, 24(r19)
 	stq	r11, 24(r30)
+	ldq	B3modb, 32(r19)
 	stq	r12, 32(r30)
+	ldq	B4modb, 40(r19)
 	stq	r13, 40(r30)
-	stq	r14, 48(r30)
-	stq	r15, 56(r30)
+	ldq	B5modb, 48(r19)
 	s8addq	n, ap, ap		C point ap at vector end
 
-	ldq	B1modb, 16(r19)
-	ldq	B2modb, 24(r19)
-	ldq	B3modb, 32(r19)
-	ldq	B4modb, 40(r19)
-	ldq	B5modb, 48(r19)
-
 	and	n, 3, r0
 	lda	n, -4(n)
 	beq	r0, L(b0)
@@ -100,7 +96,7 @@
 	mulq	r22, B2modb, r9
 	umulh	r22, B2modb, r13
 	mulq	r23, B3modb, r10
-	umulh	r23, B3modb, r14
+	umulh	r23, B3modb, r27
 	addq	r8, r20, pl
 	cmpult	pl, r8, r0
 	addq	r0, r12, ph
@@ -110,7 +106,7 @@
 	addq	r0, ph, ph
 	addq	r10, pl, rl
 	cmpult	rl, r10, r0
-	addq	r14, ph, ph
+	addq	r27, ph, ph
 	addq	r0, ph, rh
 	lda	ap, -64(ap)
 	br	L(com)
@@ -141,9 +137,9 @@
 	mulq	r22, B2modb, r9
 	umulh	r22, B2modb, r13
 	mulq	r23, B3modb, r10
-	umulh	r23, B3modb, r14
+	umulh	r23, B3modb, r27
 	mulq	rl, B4modb, r11
-	umulh	rl, B4modb, r15
+	umulh	rl, B4modb, r28
 	ble	n, L(ed2)
 
 	ALIGN(16)
@@ -165,21 +161,21 @@
 	addq	r10, pl, pl
 	mulq	r22, B2modb, r9
 	cmpult	pl, r10, r0
-	addq	r14, ph, ph
+	addq	r27, ph, ph
 	addq	r11, pl, pl
 	umulh	r22, B2modb, r13
 	addq	r0, ph, ph
 	cmpult	pl, r11, r0
-	addq	r15, ph, ph
+	addq	r28, ph, ph
 	mulq	r23, B3modb, r10
 	ldq	r20, 32(ap)
 	addq	pl, rl, rl
-	umulh	r23, B3modb, r14
+	umulh	r23, B3modb, r27
 	addq	r0, ph, ph
 	cmpult	rl, pl, r0
 	mulq	rl, B4modb, r11
 	addq	ph, rh, rh
-	umulh	rl, B4modb, r15
+	umulh	rl, B4modb, r28
 	addq	r0, rh, rh
 	lda	n, -4(n)
 	bgt	n, L(top)
@@ -195,11 +191,11 @@
 	addq	r0, ph, ph
 	addq	r10, pl, pl
 	cmpult	pl, r10, r0
-	addq	r14, ph, ph
+	addq	r27, ph, ph
 	addq	r11, pl, pl
 	addq	r0, ph, ph
 	cmpult	pl, r11, r0
-	addq	r15, ph, ph
+	addq	r28, ph, ph
 	addq	pl, rl, rl
 	addq	r0, ph, ph
 	cmpult	rl, pl, r0
@@ -245,92 +241,92 @@
 	ldq	r11, 24(r30)
 	ldq	r12, 32(r30)
 	ldq	r13, 40(r30)
-	ldq	r14, 48(r30)
-	ldq	r15, 56(r30)
-	lda	r30, 80(r30)
+	lda	r30, 64(r30)
 	ret	r31, (r26), 1
 EPILOGUE()
 
 PROLOGUE(mpn_mod_1s_4p_cps,gp)
-	ldgp	r29,	0(r27)
-	LEA(	r28,	__clz_tab)
-	lda	r30,	-32(r30)
-	lda	r5,	65(r31)
-	cmpbge	r31,	r17,	r8
-	stq	r26,	0(r30)
-	stq	r10,	16(r30)
-	srl	r8,	1,	r7
-	xor	r7,	127,	r6
-	stq	r11,	24(r30)
-	stq	r9,	8(r30)
-	bis	r31,	r16,	r11
-	addq	r6,	r28,	r4
-	ldbu	r2,	0(r4)
-	s8subq	r2,	7,	r3
-	srl	r17,	r3,	r27
-	subq	r5,	r3,	r26
-	addq	r27,	r28,	r10
-	ldbu	r9,	0(r10)
-	subq	r26,	r9,	r10
-	sll	r17,	r10,	r9
-	bis	r31,	r9,	r16
-	jsr	r26,	mpn_invert_limb
-	ldgp	r29,	0(r26)
-	stq	r10,	8(r11)
-	subq	r31,	r10,	r25
-	lda	r24,	1(r31)
-	subq	r31,	r9,	r20
-	stq	r0,	0(r11)
-	srl	r0,	r25,	r22
-	sll	r24,	r10,	r23
-	bis	r22,	r23,	r21
-	mulq	r20,	r21,	r1
-	umulh	r1,	r0,	r18
-	srl	r1,	r10,	r19
-	mulq	r1,	r0,	r8
-	stq	r19,	16(r11)
-	addq	r18,	r1,	r17
-	ornot	r31,	r17,	r16
-	mulq	r16,	r9,	r2
-	cmpule	r2,	r8,	r7
-	addq	r2,	r9,	r6
-	cmoveq	r7,	r6,	r2
-	umulh	r2,	r0,	r4
-	srl	r2,	r10,	r5
-	mulq	r2,	r0,	r27
-	stq	r5,	24(r11)
-	addq	r4,	r2,	r3
-	ornot	r31,	r3,	r28
-	mulq	r28,	r9,	r23
-	cmpule	r23,	r27,	r26
-	addq	r23,	r9,	r25
-	cmoveq	r26,	r25,	r23
-	ldq	r26,	0(r30)
-	umulh	r23,	r0,	r22
-	srl	r23,	r10,	r24
-	mulq	r23,	r0,	r19
-	stq	r24,	32(r11)
-	addq	r22,	r23,	r21
-	ornot	r31,	r21,	r20
-	mulq	r20,	r9,	r1
-	addq	r1,	r9,	r17
-	cmpule	r1,	r19,	r18
-	cmoveq	r18,	r17,	r1
-	umulh	r1,	r0,	r8
-	srl	r1,	r10,	r16
-	mulq	r1,	r0,	r5
-	stq	r16,	40(r11)
-	addq	r8,	r1,	r7
-	ornot	r31,	r7,	r6
-	mulq	r6,	r9,	r2
-	addq	r2,	r9,	r3
-	cmpule	r2,	r5,	r4
-	ldq	r9,	8(r30)
-	cmoveq	r4,	r3,	r2
-	srl	r2,	r10,	r0
-	ldq	r10,	16(r30)
-	stq	r0,	48(r11)
-	ldq	r11,	24(r30)
-	lda	r30,	32(r30)
-	ret	r31,	(r26),	1
+	lda	r30, -32(r30)
+	stq	r26, 0(r30)
+	stq	r9, 8(r30)
+	stq	r10, 16(r30)
+	stq	r11, 24(r30)
+	mov	r16, r11
+	LEA(	r4, __clz_tab)
+	lda	r10, 65(r31)
+	cmpbge	r31, r17, r1
+	srl	r1, 1, r1
+	xor	r1, 127, r1
+	addq	r1, r4, r1
+	ldq_u	r2, 0(r1)
+	extbl	r2, r1, r2
+	s8subq	r2, 7, r2
+	srl	r17, r2, r3
+	subq	r10, r2, r10
+	addq	r3, r4, r3
+	ldq_u	r1, 0(r3)
+	extbl	r1, r3, r1
+	subq	r10, r1, r10
+	sll	r17, r10, r9
+	mov	r9, r16
+	jsr	r26, mpn_invert_limb
+	ldah	r29, 0(r26)
+	subq	r31, r10, r2
+	lda	r1, 1(r31)
+	sll	r1, r10, r1
+	subq	r31, r9, r3
+	srl	r0, r2, r2
+	ldq	r26, 0(r30)
+	bis	r2, r1, r2
+	lda	r29, 0(r29)
+	stq	r0, 0(r11)
+	stq	r10, 8(r11)
+	mulq	r2, r3, r2
+	srl	r2, r10, r3
+	umulh	r2, r0, r1
+	stq	r3, 16(r11)
+	mulq	r2, r0, r3
+	ornot	r31, r1, r1
+	subq	r1, r2, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r2
+	cmpule	r1, r3, r3
+	cmoveq	r3, r2, r1
+	srl	r1, r10, r3
+	umulh	r1, r0, r2
+	stq	r3, 24(r11)
+	mulq	r1, r0, r3
+	ornot	r31, r2, r2
+	subq	r2, r1, r2
+	mulq	r2, r9, r2
+	addq	r2, r9, r1
+	cmpule	r2, r3, r3
+	cmoveq	r3, r1, r2
+	srl	r2, r10, r1
+	umulh	r2, r0, r3
+	stq	r1, 32(r11)
+	mulq	r2, r0, r1
+	ornot	r31, r3, r3
+	subq	r3, r2, r3
+	mulq	r3, r9, r3
+	addq	r3, r9, r2
+	cmpule	r3, r1, r1
+	cmoveq	r1, r2, r3
+	srl	r3, r10, r2
+	umulh	r3, r0, r1
+	stq	r2, 40(r11)
+	mulq	r3, r0, r0
+	ornot	r31, r1, r1


More information about the gmp-commit mailing list