FAT GMP 5 binaries
Kevin Ryde
user42@zip.com.au
Mon, 19 May 2003 09:12:17 +1000
--=-=-=
Torbjorn Granlund <tege@swox.com> writes:
>
> For example, the function mpn_addmul_1 would tailcall
> *(__gmp_cpuvec[RNUM_GMP_ADDMUL_1]).
Or a struct for more type checking. Probably wise to do the jumping
functions in assembler, to be certain of getting the desired code.
> We should recognize most known processors, and choose our
> specially tailored routines for those. Such routines would be
> called mpn_addmul_1_athlon, mpn_addmul_1_k6,
> mpn_addmul_1_pentium4, etc.
I'm thinking of some sort of fat_path in configure which would link
found asm files into the build directory under names like
p4_addmul_1.asm, etc. Will have to exclude mmx/sse2 if the assembler
can't handle it.
Probably have a WANT_FAT_BINARIES to tell asm-defs.m4 to add a suffix
to the function names. The asm files themselves will probably have to
declare what processor they're for. (The alternative is lots of build
rules in mpn/Makefile.am to mung the function names.)
> We need to decide what routines to put in __gmp_cpuvec. All GMP
> routines is one choice. All mpn routines or a subset of mpn
> routines are other choices.
Might be worth starting with just a few. Some sample code below
(untested).
> With some special mechanism, we could actually allow run-time
> selection of vectors from timing tests, usable for processors
> that we didn't know at the time of a release. That should be
> right down your alley. :-)
Me? Hehe. That could be within the realms of reason actually. The
full tune/speed stuff is pretty big and ugly, but just comparing a few
routines with rdtsc couldn't be too bad.
--=-=-=
Content-Type: text/x-csrc
Content-Disposition: attachment; filename=fat.c
#include <string.h>
#include "gmp.h"
#include "gmp-impl.h"
/* config.h */
/* #define WANT_FAT_BINARY 1 */
/* gmp-impl.h */
#if WANT_FAT_BINARY
/* Dispatch vector for fat binaries: one function pointer per runtime-
   selectable mpn routine.  Filled in by cpuvec_init according to the CPU
   actually running the program.
   NOTE: The layout of this structure is also known to mpn/asm-defs.m4
   (the CPUVEC_OFFSET_<FIELD> values used by fat_entry.asm).
   Be sure to keep them the same when making any changes. */
struct cpuvec_t {
mp_limb_t (*addmul_1) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
mp_limb_t));
void (*mul_basecase) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
mp_srcptr, mp_size_t));
void (*sqr_basecase) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
mp_limb_t (*submul_1) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
mp_limb_t));
};
/* The single global vector; defined in fat.c, read by the asm
   entrypoints in fat_entry.asm. */
extern struct cpuvec_t __gmpn_cpuvec;
/* direct calls through __gmpn_cpuvec: C callers of mpn_addmul_1 etc pick
   up the runtime-selected routine without going through the asm stubs */
#undef mpn_addmul_1
#define mpn_addmul_1 (*__gmpn_cpuvec.addmul_1)
#undef mpn_mul_basecase
#define mpn_mul_basecase (*__gmpn_cpuvec.mul_basecase)
#undef mpn_sqr_basecase
#define mpn_sqr_basecase (*__gmpn_cpuvec.sqr_basecase)
#undef mpn_submul_1
#define mpn_submul_1 (*__gmpn_cpuvec.submul_1)
#endif /* WANT_FAT_BINARY */
/* First-call trampolines: __gmpn_cpuvec starts out holding these.  Each
   one runs cpuvec_init to select CPU-specific routines, then dispatches
   through the (now updated) vector. */
static mp_limb_t addmul_1_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
static void mul_basecase_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
static void sqr_basecase_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
static mp_limb_t submul_1_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
/* The dispatch vector starts out pointing at the *_init trampolines,
   which run cpuvec_init on first use and then re-dispatch.
   NOTE: initializers must follow struct cpuvec_t field order
   (addmul_1, mul_basecase, sqr_basecase, submul_1).  The previous
   version had addmul_1_init and submul_1_init transposed, so the first
   mpn_addmul_1 call would have performed a submul and vice versa. */
struct cpuvec_t __gmpn_cpuvec = {
  addmul_1_init,
  mul_basecase_init,
  sqr_basecase_init,
  submul_1_init,
};
/* CPU-specific routines cpuvec_init can install.  The _x86 variants are
   the plain fallbacks, usable on any x86. */
mp_limb_t __gmpn_addmul_1_k6 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_addmul_1_pentium __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_addmul_1_x86 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
void __gmpn_mul_basecase_k6 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
void __gmpn_mul_basecase_pentium __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
void __gmpn_mul_basecase_x86 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
void __gmpn_sqr_basecase_k6 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
void __gmpn_sqr_basecase_pentium __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
void __gmpn_sqr_basecase_x86 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
mp_limb_t __gmpn_submul_1_k6 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_submul_1_pentium __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_submul_1_x86 __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
/* Detect the runtime x86 CPU and fill __gmpn_cpuvec with pointers to the
   best routines available for it.  Called once, lazily, from the *_init
   trampolines the vector initially contains.

   cpuid leaf 0 gives the 12-char vendor string; leaf 1 returns the
   family/model/stepping word (family in bits 8-11, model in bits 4-7). */
static void
cpuvec_init (void)
{
  char  vendor_string[13];
  char  dummy_string[12];
  long  fms;
  int   family, model;

  /* Install the plain-x86 fallbacks first; any CPU recognized below
     simply overwrites the relevant entries.  Doing it this way (rather
     than testing the vector for NULL afterwards, as the previous version
     did) guarantees every field ends up pointing at a real routine: the
     fields start out holding the *_init trampolines, never NULL, so a
     NULL test could never fire and an unrecognized CPU would recurse
     through the trampolines forever.
     FIXME: Test the cpuid capability bits too, and choose routines using
     those insns (mmx, sse2, ...). */
  __gmpn_cpuvec.addmul_1 = __gmpn_addmul_1_x86;
  __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_x86;
  __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_x86;
  __gmpn_cpuvec.submul_1 = __gmpn_submul_1_x86;

  __gmpn_cpuid (vendor_string, 0);
  vendor_string[12] = 0;   /* cpuid returns 12 chars, unterminated */

  fms = __gmpn_cpuid (dummy_string, 1);
  family = (fms >> 8) & 15;
  model = (fms >> 4) & 15;

  if (strcmp (vendor_string, "GenuineIntel") == 0)
    {
      switch (family)
        {
        case 5:
          if (model <= 2)
            {
              /* pentium */
              __gmpn_cpuvec.addmul_1 = __gmpn_addmul_1_pentium;
              __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_pentium;
              __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_pentium;
              __gmpn_cpuvec.submul_1 = __gmpn_submul_1_pentium;
            }
          else if (model >= 4)
            {
              /* pentiummmx -- no specific routines wired up yet */
            }
          break;
        case 6:
          if (model == 1)
            {
              /* pentiumpro */
            }
          else if (model <= 6)
            {
              /* pentium2 */
            }
          else
            {
              /* pentium3 */
            }
          break;
        case 15:
          /* pentium4 */
          break;
        }
    }
  else if (strcmp (vendor_string, "AuthenticAMD") == 0)
    {
      switch (family)
        {
        case 5:
          if (model <= 3)
            {
              /* k5 */
            }
          else if (model <= 7)
            {
              /* k6 */
              __gmpn_cpuvec.addmul_1 = __gmpn_addmul_1_k6;
              __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_k6;
              __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_k6;
              __gmpn_cpuvec.submul_1 = __gmpn_submul_1_k6;
            }
          else if (model <= 8)
            {
              /* k62 */
            }
          else if (model <= 9)
            {
              /* k63 */
            }
          break;
        case 6:
          /* athlon */
          break;
        }
    }
  else if (strcmp (vendor_string, "CyrixInstead") == 0)
    {
      /* Should recognize Cyrix' processors too. */
    }
}
/* First-call trampoline for mpn_addmul_1: select the CPU-specific
   routines, then dispatch to the one chosen and return its carry limb.
   The previous version dropped the return value, leaving callers with an
   indeterminate carry (UB for a non-void function whose value is used). */
static mp_limb_t
addmul_1_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t vl)
{
  cpuvec_init ();
  return (*__gmpn_cpuvec.addmul_1) (rp, up, un, vl);
}
/* First call through mpn_mul_basecase: establish the CPU-specific
   vector, then hand control to whichever routine was installed. */
static void
mul_basecase_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn)
{
  /* Fill in __gmpn_cpuvec for the CPU we're running on. */
  cpuvec_init ();
  /* Implicit dereference of the function pointer; same call as (*...)(). */
  __gmpn_cpuvec.mul_basecase (rp, up, un, vp, vn);
}
/* First call through mpn_sqr_basecase: establish the CPU-specific
   vector, then hand control to whichever routine was installed. */
static void
sqr_basecase_init (mp_ptr rp, mp_srcptr up, mp_size_t un)
{
  /* Fill in __gmpn_cpuvec for the CPU we're running on. */
  cpuvec_init ();
  /* Implicit dereference of the function pointer; same call as (*...)(). */
  __gmpn_cpuvec.sqr_basecase (rp, up, un);
}
/* First-call trampoline for mpn_submul_1: select the CPU-specific
   routines, then dispatch to the one chosen and return its borrow limb.
   The previous version dropped the return value, leaving callers with an
   indeterminate borrow (UB for a non-void function whose value is used). */
static mp_limb_t
submul_1_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t vl)
{
  cpuvec_init ();
  return (*__gmpn_cpuvec.submul_1) (rp, up, un, vl);
}
--=-=-=
Content-Disposition: attachment; filename=fat_entry.asm
dnl x86 fat binary entrypoints.
dnl Copyright 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')
dnl Forcibly disable profiling on this file, the routines jumped to will
dnl have any profiling and the code here is so small it's not worth
dnl showing.
define(`WANT_PROFILING',no)
dnl Usage: jumpto(offset)
dnl
dnl Offset should be one of the CPUVEC_OFFSET_<FIELD> values.
dnl
dnl Emit a tail-jump through the __gmpn_cpuvec entry at the given byte
dnl offset, so each mpn entrypoint is just one indirect jmp.  Under PIC
dnl the address of __gmpn_cpuvec is fetched from the GOT: the call to
dnl L(movl_edx_eip) puts the current instruction pointer in %edx, then
dnl _GLOBAL_OFFSET_TABLE_ is added to form the GOT base.
dnl NOTE(review): this clobbers %edx -- assumed safe because %edx is not
dnl a parameter or callee-saved register in the targeted x86 calling
dnl conventions; confirm for any new ABI.
define(jumpto,
m4_assert_numargs(1)
`ifdef(`PIC',
` call L(movl_edx_eip)
L(here$1):
addl $_GLOBAL_OFFSET_TABLE_+[.-L(here$1)], %edx
movl __gmpn_cpuvec@GOT(%edx), %edx
jmp *$1(%edx)
',`dnl non-PIC
jmp *__gmpn_cpuvec+$1
')')
dnl Alignment for each stub: the PIC form is bigger, so give it 16 bytes.
define(al,`ifdef(`PIC',16,8)')
TEXT
dnl Public mpn entrypoints for the fat binary; each is a single indirect
dnl jump through the corresponding __gmpn_cpuvec slot.
ALIGN(al)
PROLOGUE(mpn_addmul_1)
jumpto(CPUVEC_OFFSET_ADDMUL_1)
EPILOGUE()
ALIGN(al)
PROLOGUE(mpn_mul_basecase)
jumpto(CPUVEC_OFFSET_MUL_BASECASE)
EPILOGUE()
ALIGN(al)
PROLOGUE(mpn_sqr_basecase)
jumpto(CPUVEC_OFFSET_SQR_BASECASE)
EPILOGUE()
ALIGN(al)
PROLOGUE(mpn_submul_1)
jumpto(CPUVEC_OFFSET_SUBMUL_1)
EPILOGUE()
dnl PIC helper used by jumpto: returns the caller's return address (the
dnl instruction pointer after the call) in %edx.
ifdef(`PIC',`
ALIGN(al)
L(movl_edx_eip):
movl (%esp), %edx
ret_internal
')
--=-=-=--