[Gmp-commit] /var/hg/gmp: 6 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Mar 21 23:10:12 CET 2013
details: /var/hg/gmp/rev/2528ba817c41
changeset: 15627:2528ba817c41
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:03:07 2013 +0100
description:
Move variables out from generated file, make them 'const'.
details: /var/hg/gmp/rev/f89b56f2cade
changeset: 15628:f89b56f2cade
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:03:41 2013 +0100
description:
Add and correct cycle numbers.
details: /var/hg/gmp/rev/4d3c8f773d49
changeset: 15629:4d3c8f773d49
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:04:23 2013 +0100
description:
Fix a comment typo.
details: /var/hg/gmp/rev/02737f9759a9
changeset: 15630:02737f9759a9
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:06:29 2013 +0100
description:
Trim 'sqr_diag_addlsh1' loop.
details: /var/hg/gmp/rev/8a8826f8459d
changeset: 15631:8a8826f8459d
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:09:46 2013 +0100
description:
Add arm/neon lshiftc support.
details: /var/hg/gmp/rev/4b32bd60c390
changeset: 15632:4b32bd60c390
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Mar 21 23:10:04 2013 +0100
description:
ChangeLog
diffstat:
ChangeLog | 10 +
gen-trialdivtab.c | 18 +-
longlong.h | 2 +-
mpn/arm/mode1o.asm | 4 +-
mpn/arm/neon/lshiftc.asm | 245 ++++++++++++++++++++++++++++++++++++++++++++
mpn/arm/v6/sqr_basecase.asm | 6 +-
mpn/generic/trialdiv.c | 16 ++-
7 files changed, 287 insertions(+), 14 deletions(-)
diffs (truncated from 417 to 300 lines):
diff -r 055b847ac3d7 -r 4b32bd60c390 ChangeLog
--- a/ChangeLog Wed Mar 20 18:40:58 2013 +0100
+++ b/ChangeLog Thu Mar 21 23:10:04 2013 +0100
@@ -1,3 +1,13 @@
+2013-03-21 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/neon/lshiftc.asm: New file.
+
+ * mpn/arm/v6/sqr_basecase.asm: Trim 'sqr_diag_addlsh1' loop.
+
+ * gen-trialdivtab.c: Output just raw data, remove actual variables.
+ * mpn/generic/trialdiv.c: Put variables from gen-trialdivtab.c here,
+ and make them 'const'.
+
2013-03-20 Torbjorn Granlund <tege at gmplib.org>
* config.guess: Rework arm CPU recognition.
diff -r 055b847ac3d7 -r 4b32bd60c390 gen-trialdivtab.c
--- a/gen-trialdivtab.c Wed Mar 20 18:40:58 2013 +0100
+++ b/gen-trialdivtab.c Thu Mar 21 23:10:04 2013 +0100
@@ -2,7 +2,7 @@
Contributed to the GNU project by Torbjorn Granlund.
-Copyright 2009, 2012 Free Software Foundation, Inc.
+Copyright 2009, 2012, 2013 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -98,7 +98,9 @@
omitted_p = 3;
interval_end = 0;
- printf ("static struct gmp_primes_dtab gmp_primes_dtab[] = {\n");
+/* printf ("static struct gmp_primes_dtab gmp_primes_dtab[] = {\n"); */
+
+ printf ("#ifdef WANT_dtab\n");
for (t = start_p; t <= end_p; t += 2)
{
@@ -120,7 +122,7 @@
if (! isprime (p))
continue;
- printf (" P(%d,", (int) p);
+ printf (" P(%d,", (int) p);
mpz_invert_ui_2exp (inv, p, limb_bits);
printf ("CNST_LIMB(0x"); mpz_out_str (stdout, 16, inv); printf ("),");
@@ -138,10 +140,12 @@
}
interval_end = t;
}
- printf (" P(0,0,0)\n};\n");
+ printf ("#define SMALLEST_OMITTED_PRIME %d\n", (int) omitted_p);
+ printf ("#endif\n");
+ printf ("#ifdef WANT_ptab\n");
- printf ("static struct gmp_primes_ptab gmp_primes_ptab[] = {\n");
+/* printf ("static struct gmp_primes_ptab gmp_primes_ptab[] = {\n"); */
endtok = "";
@@ -193,9 +197,9 @@
interval_end = t;
np++;
}
- printf ("\n};\n");
- printf ("#define SMALLEST_OMITTED_PRIME %d\n", (int) omitted_p);
+ printf ("\n");
+ printf ("#endif\n");
return 0;
}
diff -r 055b847ac3d7 -r 4b32bd60c390 longlong.h
--- a/longlong.h Wed Mar 20 18:40:58 2013 +0100
+++ b/longlong.h Thu Mar 21 23:10:04 2013 +0100
@@ -514,7 +514,7 @@
#define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif
-/* This is a bizarre test, but GCC doesn't define useful common symbol. */
+/* This is a bizarre test, but GCC doesn't define any useful common symbol. */
#if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \
defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \
diff -r 055b847ac3d7 -r 4b32bd60c390 mpn/arm/mode1o.asm
--- a/mpn/arm/mode1o.asm Wed Mar 20 18:40:58 2013 +0100
+++ b/mpn/arm/mode1o.asm Thu Mar 21 23:10:04 2013 +0100
@@ -25,8 +25,8 @@
C StrongARM ?
C XScale ?
C Cortex-A8 ?
-C Cortex-A9 10
-C Cortex-A15 ?
+C Cortex-A9 9
+C Cortex-A15 7
define(`up', `r0')
define(`n', `r1')
diff -r 055b847ac3d7 -r 4b32bd60c390 mpn/arm/neon/lshiftc.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/neon/lshiftc.asm Thu Mar 21 23:10:04 2013 +0100
@@ -0,0 +1,245 @@
+dnl ARM Neon mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C StrongARM n/a
+C XScale n/a
+C Cortex-A8 ?
+C Cortex-A9 3.5 3.5 Y
+C Cortex-A15 1.75 1.75 Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in code,
+C the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshiftc')
+define(`OPERATION_lshiftc',1)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+IFLSH(` mov r12, n, lsl #2 ')
+IFLSH(` add rp, rp, r12 ')
+IFLSH(` add ap, ap, r12 ')
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshiftc',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+ vmvn d19, d19
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ vmvn d19, d19
+ blt L(two_or_three_more)
+ tst n, #2
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+L(top): vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vmvn d17, d17
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vmvn.u8 d17, #0
+ vorr d2, d5, d0
+ vshl.u64 d0, d17, d7
+ vorr d3, d4, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): vmov.u8 d16, #0
+IFLSH(` add ap, ap, #4 ')
+ vmvn d17, d17
+ vld1.32 {d16[Y]}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vmvn d16, d16
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vmvn.u8 d17, #0
+ vorr d2, d4, d1
More information about the gmp-commit
mailing list