[Gmp-commit] /home/hgfiles/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Jan 23 15:39:16 CET 2011
details: /home/hgfiles/gmp/rev/4f0837567e84
changeset: 13763:4f0837567e84
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Jan 23 14:06:53 2011 +0100
description:
Fix typo.
details: /home/hgfiles/gmp/rev/c5743d069e2f
changeset: 13764:c5743d069e2f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Jan 23 15:39:08 2011 +0100
description:
Rewrite, adding mpn_addmul_2s entry point.
diffstat:
ChangeLog | 4 +
mpn/generic/sqr_basecase.c | 1 +
mpn/ia64/addmul_2.asm | 961 +++++++++++++++++++++++---------------------
3 files changed, 506 insertions(+), 460 deletions(-)
diffs (truncated from 1110 to 300 lines):
diff -r 389ed05793c7 -r c5743d069e2f ChangeLog
--- a/ChangeLog Sat Jan 22 22:30:31 2011 +0100
+++ b/ChangeLog Sun Jan 23 15:39:08 2011 +0100
@@ -1,3 +1,7 @@
+2011-01-23 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/ia64/addmul_2.asm: Rewrite, adding mpn_addmul_2s entry point.
+
2011-01-22 Torbjorn Granlund <tege at gmplib.org>
* mpn/ia64/aors_n.asm: Fix some incorrect bundle types.
diff -r 389ed05793c7 -r c5743d069e2f mpn/generic/sqr_basecase.c
--- a/mpn/generic/sqr_basecase.c Sat Jan 22 22:30:31 2011 +0100
+++ b/mpn/generic/sqr_basecase.c Sun Jan 23 15:39:08 2011 +0100
@@ -58,6 +58,7 @@
rp[2 * n - 1] += cy; \
} while (0)
#else
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \
do { \
mp_limb_t cy; \
MPN_SQR_DIAGONAL (rp, up, n); \
diff -r 389ed05793c7 -r c5743d069e2f mpn/ia64/addmul_2.asm
--- a/mpn/ia64/addmul_2.asm Sat Jan 22 22:30:31 2011 +0100
+++ b/mpn/ia64/addmul_2.asm Sun Jan 23 15:39:08 2011 +0100
@@ -3,7 +3,7 @@
dnl Contributed to the GNU project by Torbjorn Granlund.
-dnl Copyright 2004, 2005 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -32,10 +32,8 @@
C TODO
C * Clean up variable names, and try to decrease the number of distinct
C registers used.
-C * Cleanup feed-in code to not require zeroing several registers.
-C * Make sure we don't depend on uninitialized predicate registers.
-C * We currently cross-jump very aggressively, at the expense of a few cycles
-C per operation. Consider changing that.
+C * Clean up feed-in code to not require zeroing several registers.
+C * Make sure we don't depend on uninitialised predicate registers.
C * Could perhaps save a few cycles by using 1 c/l carry propagation in
C wind-down code.
C * Ultimately rewrite. The problem with this code is that it first uses a
@@ -96,564 +94,607 @@
define(`uy',`f51')
ASM_START()
+PROLOGUE(mpn_addmul_2s)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',`
+.mmi; addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+.mmi; nop 1
+ nop 1
+ zxt4 n = n C I
+ ;;')
+
+.mmi; ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov r2 = ar.lc C I0
+.mmi; ldf8 rx = [rp], 8 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+.mmi; ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+.mmi; ldf8 ry = [rp], -8 C M
+ cmp.eq p14, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+.mmi; add srp = 16, rp C M I
+ cmp.eq p15, p0 = 3, r14 C M I
+ mov ar.lc = n C I0
+.bbb; (p14) br.dptk L(x01) C B
+ (p11) br.dptk L(x10) C B
+ (p15) br.dptk L(x11) C B
+ ;;
+
+L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_3 = f0
+ br L(b00)
+L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_2 = f0
+ br L(b01)
+L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_1 = f0
+ br L(b10)
+L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_0 = f0
+ br L(b11)
+
+EPILOGUE()
+
PROLOGUE(mpn_addmul_2)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
-` addp4 rp = 0, rp C M I
- addp4 up = 0, up C M I
- addp4 vp = 0, vp C M I
- zxt4 n = n C I
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ nop 1
+ nop 1
+ zxt4 n = n C I
;;')
-{.mmi C 00
- ldf8 ux = [up], 8 C M
- ldf8 v0 = [vp], 8 C M
- mov.i r2 = ar.lc C I0
-}{.mmi
- ldf8 rx = [rp], 8 C M
- and r14 = 3, n C M I
- add n = -2, n C M I
+.mmi; ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov r2 = ar.lc C I0
+.mmi; ldf8 rx = [rp], 8 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
;;
-}{.mmi C 01
- ldf8 uy = [up], 8 C M
- ldf8 v1 = [vp] C M
- shr.u n = n, 2 C I0
-}{.mmi
- ldf8 ry = [rp], -8 C M
- cmp.eq p10, p0 = 1, r14 C M I
- cmp.eq p11, p0 = 2, r14 C M I
+.mmi; ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+.mmi; ldf8 ry = [rp], -8 C M
+ cmp.eq p14, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
;;
-}{.mmi C 02
- add srp = 16, rp C M I
- cmp.eq p12, p0 = 3, r14 C M I
- mov.i ar.lc = n C I0
-}{.bbb
- (p10) br.dptk .Lb01 C B
- (p11) br.dptk .Lb10 C B
- (p12) br.dptk .Lb11 C B
+.mmi; add srp = 16, rp C M I
+ cmp.eq p15, p6 = 3, r14 C M I
+ mov ar.lc = n C I0
+.bbb; (p14) br.dptk L(b01) C B
+ (p11) br.dptk L(b10) C B
+ (p15) br.dptk L(b11) C B
;;
-}
ALIGN(32)
-.Lb00: ldf8 r_1 = [srp], 8
- ldf8 u_1 = [up], 8
- mov acc1_2 = 0
- mov pr1_2 = 0
- mov pr0_3 = 0
- cmp.ne p8, p9 = r0, r0
+L(b00):
+.mmi; ldf8 r_1 = [srp], 8
+ ldf8 u_1 = [up], 8
+ mov acc1_2 = 0
+.mmi; mov pr1_2 = 0
+ mov pr0_3 = 0
+ cmp.ne p8, p9 = r0, r0
;;
- ldf8 r_2 = [srp], 8
- xma.l fp0b_3 = ux, v0, rx
- cmp.ne p12, p13 = r0, r0
- ldf8 u_2 = [up], 8
- xma.hu fp1a_3 = ux, v0, rx
- br.cloop.dptk .grt4
+.mfi; ldf8 r_2 = [srp], 8
+ xma.l fp0b_3 = ux, v0, rx
+ cmp.ne p12, p13 = r0, r0
+.mfb; ldf8 u_2 = [up], 8
+ xma.hu fp1b_3 = ux, v0, rx
+ br.cloop.dptk L(gt4)
- xma.l fp0b_0 = uy, v0, ry
- xma.hu fp1a_0 = uy, v0, ry
+ xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
;;
- getf.sig acc0 = fp0b_3
- xma.l fp1b_3 = ux, v1, fp1a_3
- xma.hu fp2a_3 = ux, v1, fp1a_3
+ getfsig acc0 = fp0b_3
+ (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
;;
- xma.l fp0b_1 = u_1, v0, r_1
- xma.hu fp1a_1 = u_1, v0, r_1
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
;;
- getf.sig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
;;
- getf.sig pr1_3 = fp1b_3
- getf.sig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, r_2
- xma.hu fp1a_2 = u_2, v0, r_2
- br .Lcj4
+ getfsig pr1_3 = fp1b_3
+ getfsig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br L(cj4)
-.grt4: xma.l fp0b_0 = uy, v0, ry
- xma.hu fp1a_0 = uy, v0, ry
+L(gt4): xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
;;
- ldf8 r_3 = [srp], 8
- getf.sig acc0 = fp0b_3
- xma.l fp1b_3 = ux, v1, fp1a_3
- ldf8 u_3 = [up], 8
- xma.hu fp2a_3 = ux, v1, fp1a_3
+ ldf8 r_3 = [srp], 8
+ getfsig acc0 = fp0b_3
+ (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ ldf8 u_3 = [up], 8
+ (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
;;
- xma.l fp0b_1 = u_1, v0, r_1
- xma.hu fp1a_1 = u_1, v0, r_1
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
;;
- ldf8 r_0 = [srp], 8
- getf.sig pr0_0 = fp0b_0
- xma.l fp1b_0 = uy, v1, fp1a_0
- xma.hu fp2a_0 = uy, v1, fp1a_0
+ ldf8 r_0 = [srp], 8
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
;;
- ldf8 u_0 = [up], 8
- getf.sig pr1_3 = fp1b_3
+ ldf8 u_0 = [up], 8
+ getfsig pr1_3 = fp1b_3
+ xma.l fp0b_2 = u_2, v0, r_2
;;
- getf.sig acc1_3 = fp2a_3
- xma.l fp0b_2 = u_2, v0, r_2
- xma.hu fp1a_2 = u_2, v0, r_2
- br .LL00
+ getfsig acc1_3 = fp2a_3
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br L(00)
ALIGN(32)
-.Lb01: ldf8 r_0 = [srp], 8 C M
- ldf8 u_0 = [up], 8 C M
- mov acc1_1 = 0 C M I
- mov pr1_1 = 0 C M I
- mov pr0_2 = 0 C M I
- cmp.ne p6, p7 = r0, r0 C M I
+L(b01):
+.mmi; ldf8 r_0 = [srp], 8 C M
+ ldf8 u_0 = [up], 8 C M
+ mov acc1_1 = 0 C M I
+.mmi; mov pr1_1 = 0 C M I
+ mov pr0_2 = 0 C M I
+ cmp.ne p6, p7 = r0, r0 C M I
;;
- ldf8 r_1 = [srp], 8 C M
- xma.l fp0b_2 = ux, v0, rx C F
- cmp.ne p10, p11 = r0, r0 C M I
- ldf8 u_1 = [up], 8 C M
- xma.hu fp1a_2 = ux, v0, rx C F
+.mfi; ldf8 r_1 = [srp], 8 C M
+ xma.l fp0b_2 = ux, v0, rx C F
+ cmp.ne p10, p11 = r0, r0 C M I
+.mfi; ldf8 u_1 = [up], 8 C M
+ xma.hu fp1b_2 = ux, v0, rx C F
More information about the gmp-commit mailing list