[Gmp-commit] /home/hgfiles/gmp: 3 new changesets
mercurial at gmplib.org
Sat Mar 20 17:41:11 CET 2010
details: /home/hgfiles/gmp/rev/d6a326f84f20
changeset: 13519:d6a326f84f20
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 20 10:47:51 2010 +0100
description:
Correct cycle counts.
details: /home/hgfiles/gmp/rev/0140ec5813b2
changeset: 13520:0140ec5813b2
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 20 15:18:44 2010 +0100
description:
Rewrite to exploit cancellation in the Newton iteration.
details: /home/hgfiles/gmp/rev/e7fa613f9dda
changeset: 13521:e7fa613f9dda
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 20 17:41:00 2010 +0100
description:
Fix comment typos.
diffstat:
ChangeLog | 5 +
mpn/powerpc64/mode64/aorslsh1_n.asm | 2 +-
mpn/powerpc64/mode64/aorslsh2_n.asm | 2 +-
mpn/powerpc64/mode64/aorslshC_n.asm | 4 +-
mpn/powerpc64/mode64/invert_limb.asm | 153 ++++++++++++++++------------------
mpn/x86_64/invert_limb.asm | 4 +-
6 files changed, 84 insertions(+), 86 deletions(-)
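The invert_limb rewrite is the substantive change here. As the new comments in the diff below spell out, the reciprocal is refined from an 11-bit table value v0 through three Newton-style steps (v1, v2, v3), the same scheme the x86_64 version already uses. The cancellation named in the second changeset description shows up in the v3 step: the term 2^96 - v2*d63 + ((v2 >> 1) & mask) fits in a single 64-bit limb, so the high half of the product v2*d63 cancels against the 2^96 term and one low-half mulld replaces a mulhdu/mulld pair. A rough C transcription of the whole pipeline follows the diffs.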
diffs (254 lines):
diff -r 04abb4422ccb -r e7fa613f9dda ChangeLog
--- a/ChangeLog Sat Mar 20 09:59:39 2010 +0100
+++ b/ChangeLog Sat Mar 20 17:41:00 2010 +0100
@@ -1,3 +1,8 @@
+2010-03-20 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/powerpc64/mode64/invert_limb.asm: Rewrite to exploit cancellation
+ in the Newton iteration.
+
2010-03-20 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/generic/toom_interpolate_8pts.c: Use mpn_sublsh2_n.
diff -r 04abb4422ccb -r e7fa613f9dda mpn/powerpc64/mode64/aorslsh1_n.asm
--- a/mpn/powerpc64/mode64/aorslsh1_n.asm Sat Mar 20 09:59:39 2010 +0100
+++ b/mpn/powerpc64/mode64/aorslsh1_n.asm Sat Mar 20 17:41:00 2010 +0100
@@ -20,7 +20,7 @@
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER3/PPC630: 1.83 (1.5 c/l should be possible)
C POWER4/PPC970: 3 (2.0 c/l should be possible)
C POWER5: 3
diff -r 04abb4422ccb -r e7fa613f9dda mpn/powerpc64/mode64/aorslsh2_n.asm
--- a/mpn/powerpc64/mode64/aorslsh2_n.asm Sat Mar 20 09:59:39 2010 +0100
+++ b/mpn/powerpc64/mode64/aorslsh2_n.asm Sat Mar 20 17:41:00 2010 +0100
@@ -20,7 +20,7 @@
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER3/PPC630: 1.83 (1.5 c/l should be possible)
C POWER4/PPC970: 3 (2.0 c/l should be possible)
C POWER5: 3
diff -r 04abb4422ccb -r e7fa613f9dda mpn/powerpc64/mode64/aorslshC_n.asm
--- a/mpn/powerpc64/mode64/aorslshC_n.asm Sat Mar 20 09:59:39 2010 +0100
+++ b/mpn/powerpc64/mode64/aorslshC_n.asm Sat Mar 20 17:41:00 2010 +0100
@@ -20,7 +20,7 @@
include(`../config.m4')
C cycles/limb
-C POWER3/PPC630: 2 (1.5 c/l should be possible)
+C POWER3/PPC630: 1.83 (1.5 c/l should be possible)
C POWER4/PPC970: 3 (2.0 c/l should be possible)
C POWER5: 3
@@ -31,8 +31,6 @@
C n r6
C STATUS
-C * Works for all sizes. Needs optimization and cleanup of feed-in code.
-C * Combine u0 and u1.
C * Try combining upx+up, and vpx+vp.
define(`rp',`r3')
diff -r 04abb4422ccb -r e7fa613f9dda mpn/powerpc64/mode64/invert_limb.asm
--- a/mpn/powerpc64/mode64/invert_limb.asm Sat Mar 20 09:59:39 2010 +0100
+++ b/mpn/powerpc64/mode64/invert_limb.asm Sat Mar 20 17:41:00 2010 +0100
@@ -1,6 +1,6 @@
dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
-dnl Copyright 2004, 2005, 2006, 2008 Free Software Foundation, Inc.
+dnl Copyright 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,91 +19,86 @@
include(`../config.m4')
-C cycles/limb
-C POWER3/PPC630: ?
-C POWER4/PPC970: 75 (including call+ret)
-
-C TODO:
-C * Pair multiply instructions.
+C cycles/limb (approximate)
+C POWER3/PPC630: 80
+C POWER4/PPC970: 86
+C POWER5: 86
ASM_START()
PROLOGUE(mpn_invert_limb)
LEAL( r12, approx_tab)
-
- srdi r11, r3, 32 C r11 = d >> 32
- rlwinm r9, r11, 10, 23, 30 C r9 = ((d >> 55) & 0xff) << 1
- lhzx r0, r12, r9 C load initial approximation
- rldic r10, r0, 6, 42
- mulld r8, r10, r10
- sldi r9, r10, 17
- mulld r0, r8, r11
- srdi r0, r0, 31
- subf r10, r0, r9
- mulld r8, r10, r10
- sldi r11, r10, 33
- mulhdu r0, r8, r3
- sldi r9, r0, 1
- subf r10, r9, r11
- sldi r11, r10, 2
- mulhdu r0, r10, r10
- mulld r8, r10, r10
- mulhdu r10, r8, r3
- mulld r9, r0, r3
- mulhdu r0, r0, r3
- addc r8, r9, r10
- addze r10, r0
- srdi r0, r8, 62
- rldimi r0, r10, 2, 0
- sldi r9, r8, 2
- subfic r10, r9, 0
- subfe r8, r0, r11
- mulhdu r10, r3, r8
- add r10, r10, r3
- mulld r9, r3, r8
- subf r11, r10, r8
- addi r0, r10, 1
- addi r8, r11, -1
- and r0, r3, r0
- addc r11, r9, r0
- addze r10, r10
- addc r0, r11, r3
- addze r10, r10
- subf r3, r10, r8
+ srdi r9, r3, 55
+ rlwinm r9, r9, 0, 23, 30 C (d >> 55) & 0x1fe
+ srdi r10, r3, 24 C d >> 24
+ lis r11, 0x1000
+ rldicl r8, r3, 0, 63 C d mod 2
+ addi r10, r10, 1 C d40
+ sldi r11, r11, 32 C 2^60
+ srdi r7, r3, 1 C d/2
+ add r7, r7, r8 C d63 = ceil(d/2)
+ neg r8, r8 C mask = -(d mod 2)
+ lhzx r0, r9, r12
+ mullw r9, r0, r0 C v0*v0
+ sldi r6, r0, 11 C v0 << 11
+ addi r0, r6, -1 C (v0 << 11) - 1
+ mulld r9, r9, r10 C v0*v0*d40
+ srdi r9, r9, 40 C v0*v0*d40 >> 40
+ subf r9, r9, r0 C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+ mulld r0, r9, r10 C v1*d40
+ sldi r6, r9, 13 C v1 << 13
+ subf r0, r0, r11 C 2^60 - v1*d40
+ mulld r0, r0, r9 C v1 * (2^60 - v1*d40)
+ srdi r0, r0, 47 C v1 * (2^60 - v1*d40) >> 47
+ add r0, r0, r6 C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+ mulld r11, r0, r7 C v2 * d63
+ srdi r10, r0, 1 C v2 >> 1
+ sldi r9, r0, 31 C v2 << 31
+ and r8, r10, r8 C (v2 >> 1) & mask
+ subf r8, r11, r8 C ((v2 >> 1) & mask) - v2 * d63
+ mulhdu r0, r8, r0 C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
+ srdi r0, r0, 1 C p1 >> 1
+ add r0, r0, r9 C v3 = (v2 << 31) + (p1 >> 1)
+ nop
+ mulhdu r9, r0, r3
+ mulld r11, r0, r3
+ addc r10, r11, r3
+ adde r3, r9, r3
+ subf r3, r3, r0
blr
EPILOGUE()
DEF_OBJECT(approx_tab)
- .short 1023,1020,1016,1012,1008,1004,1000,996
- .short 992,989,985,981,978,974,970,967
- .short 963,960,956,953,949,946,942,939
- .short 936,932,929,926,923,919,916,913
- .short 910,907,903,900,897,894,891,888
- .short 885,882,879,876,873,870,868,865
- .short 862,859,856,853,851,848,845,842
- .short 840,837,834,832,829,826,824,821
- .short 819,816,814,811,809,806,804,801
- .short 799,796,794,791,789,787,784,782
- .short 780,777,775,773,771,768,766,764
- .short 762,759,757,755,753,751,748,746
- .short 744,742,740,738,736,734,732,730
- .short 728,726,724,722,720,718,716,714
- .short 712,710,708,706,704,702,700,699
- .short 697,695,693,691,689,688,686,684
- .short 682,680,679,677,675,673,672,670
- .short 668,667,665,663,661,660,658,657
- .short 655,653,652,650,648,647,645,644
- .short 642,640,639,637,636,634,633,631
- .short 630,628,627,625,624,622,621,619
- .short 618,616,615,613,612,611,609,608
- .short 606,605,604,602,601,599,598,597
- .short 595,594,593,591,590,589,587,586
- .short 585,583,582,581,579,578,577,576
- .short 574,573,572,571,569,568,567,566
- .short 564,563,562,561,560,558,557,556
- .short 555,554,553,551,550,549,548,547
- .short 546,544,543,542,541,540,539,538
- .short 537,536,534,533,532,531,530,529
- .short 528,527,526,525,524,523,522,521
- .short 520,519,518,517,516,515,514,513
+ .short 0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
+ .short 0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
+ .short 0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
+ .short 0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
+ .short 0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
+ .short 0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
+ .short 0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
+ .short 0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
+ .short 0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
+ .short 0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
+ .short 0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
+ .short 0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
+ .short 0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
+ .short 0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
+ .short 0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
+ .short 0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
+ .short 0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
+ .short 0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
+ .short 0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
+ .short 0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
+ .short 0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
+ .short 0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
+ .short 0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
+ .short 0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
+ .short 0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
+ .short 0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
+ .short 0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
+ .short 0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
+ .short 0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
+ .short 0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
+ .short 0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
+ .short 0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
END_OBJECT(approx_tab)
ASM_END()
diff -r 04abb4422ccb -r e7fa613f9dda mpn/x86_64/invert_limb.asm
--- a/mpn/x86_64/invert_limb.asm Sat Mar 20 09:59:39 2010 +0100
+++ b/mpn/x86_64/invert_limb.asm Sat Mar 20 17:41:00 2010 +0100
@@ -63,7 +63,7 @@
dec R32(%rax)
sub R32(%rcx), R32(%rax) C %rax = v1
- C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47
+ C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
mov $0x1000000000000000, %rcx
imul %rax, %rsi C 14 17 13
sub %rsi, %rcx
@@ -72,7 +72,7 @@
shr $47, %rcx
add %rax, %rcx C %rcx = v2
- C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + (v2>>1) & mask) >> 65
+ C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
mov %rdi, %rsi C 0 0 0
shr $1, %rsi C d/2
sbb %rax, %rax C -d0 = -(d mod 2)
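For reference, here is a rough C transcription of the rewritten algorithm, assembled from the comments visible in the two invert_limb diffs above. It is a sketch, not part of the commit: it assumes a GCC/Clang-style unsigned __int128 for the double-limb products, and it replaces the approx_tab lookup with the equivalent division floor((2^19 - 3*2^8)/(d >> 55)), which reproduces the listed entries (523520/256 = 0x7fd down to 523520/511 = 0x400).

/* Rough C sketch of mpn_invert_limb as rewritten above; the function
   name and the table-replacing division are illustrative, not GMP's.
   Returns floor((2^128 - 1)/d) - 2^64 for normalized d (high bit set). */

#include <stdio.h>
#include <stdint.h>

typedef unsigned __int128 u128;   /* assumes GCC/Clang */

static uint64_t invert_limb_sketch(uint64_t d)
{
    uint64_t d0   = d & 1;              /* d mod 2 */
    uint64_t mask = 0 - d0;             /* -(d mod 2) */
    uint64_t d40  = (d >> 24) + 1;
    uint64_t d63  = (d >> 1) + d0;      /* ceil(d/2) */
    /* Stand-in for the approx_tab lookup, d9 = d >> 55 in [256,511]. */
    uint64_t v0 = ((1ULL << 19) - (3ULL << 8)) / (d >> 55);
    uint64_t v1 = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
    uint64_t v2 = (v1 << 13) + ((v1 * ((1ULL << 60) - v1 * d40)) >> 47);
    /* The cancellation: v2*d63 is needed only mod 2^64, so one low-half
       multiply suffices; the dropped high half cancels against 2^96. */
    uint64_t e  = ((v2 >> 1) & mask) - v2 * d63;
    uint64_t p1 = (uint64_t)(((u128)v2 * e) >> 64);   /* mulhdu */
    uint64_t v3 = (v2 << 31) + (p1 >> 1);   /* wraps mod 2^64 by design */
    /* Final fixup, mirroring the mulhdu/mulld/addc/adde/subf tail. */
    u128 t = (u128)v3 * d + d;              /* (v3 + 1) * d */
    return v3 - ((uint64_t)(t >> 64) + d);
}

int main(void)
{
    /* Spot-check against the definition on a few normalized divisors. */
    uint64_t tests[] = { 1ULL << 63, (1ULL << 63) + 1,
                         0xB504F333F9DE6485ULL, ~(uint64_t)0 };
    for (int i = 0; i < 4; i++) {
        uint64_t d = tests[i];
        /* floor((2^128-1)/d) - 2^64 = floor((2^128-1 - 2^64*d)/d) */
        uint64_t want = (uint64_t)((~(u128)0 - ((u128)d << 64)) / d);
        uint64_t got  = invert_limb_sketch(d);
        printf("d = 0x%016llx: 0x%016llx %s\n", (unsigned long long)d,
               (unsigned long long)got, got == want ? "ok" : "MISMATCH");
    }
    return 0;
}

Built with gcc -O2, each check should print ok; working the d = 2^63 case by hand gives v2 = 2^34 - 1, v3 = 2^64 - 1, and a zero fixup, matching floor((2^128 - 1)/2^63) - 2^64 = 0xffffffffffffffff.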