FAT GMP 5 binaries

Kevin Ryde user42@zip.com.au
Mon, 19 May 2003 09:12:17 +1000



Torbjorn Granlund <tege@swox.com> writes:
>
> For example, the function mpn_addmul_1 would tailcall
> *(__gmp_cpuvec[RNUM_GMP_ADDMUL_1]).

Or a struct for more type checking.  Probably wise to do the jumping
functions in assembler, to be certain of getting the desired code.

> We should recognize most known processors, and choose our
> specially tailored routines for those.  Such routines would be
> called mpn_addmul_1_athlon, mpn_addmul_1_k6,
> mpn_addmul_1_pentium4, etc.

I'm thinking of some sort of fat_path in configure which would link
found asm files into the build directory under names like
p4_addmul_1.asm, etc.  We'll have to exclude the mmx/sse2 files if the
assembler can't handle those instructions.

Probably have a WANT_FAT_BINARY to tell asm-defs.m4 to add a suffix
to the function names.  The asm files themselves will probably have to
declare what processor they're for.  (The alternative is lots of build
rules in mpn/Makefile.am to munge the function names.)

> We need to decide what routines to put in __gmp_cpuvec.  All GMP
> routines is one choice.  All mpn routines or a subset of mpn
> routines are other choices.

Might be worth starting with just a few.  Some sample code below
(untested).

> With some special mechanism, we could actually allow run-time
> selection of vectors from timing tests, usable for processors
> that we didn't know at the time of a release.  That should be
> right down your alley.  :-)

Me?  Hehe.  That could be within the realm of reason, actually.  The
full tune/speed stuff is pretty big and ugly, but just comparing a few
routines with rdtsc shouldn't be too bad.
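
For instance, something along these lines (untested; just a minimal
sketch assuming GCC inline asm on a TSC-capable x86, with made-up
names, not anything in GMP):

    #include "gmp.h"

    typedef unsigned long long  tsc_t;

    static tsc_t
    rdtsc (void)
    {
      unsigned  lo, hi;
      __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
      return ((tsc_t) hi << 32) | lo;
    }

    /* Best-of-10 cycle count for one call of an addmul_1-style routine,
       taking the minimum to reduce the effect of interrupts etc.  */
    static tsc_t
    time_one (mp_limb_t (*fn) (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t),
              mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
    {
      tsc_t  best = ~(tsc_t) 0;
      tsc_t  t0, t1;
      int    i;
      for (i = 0; i < 10; i++)
        {
          t0 = rdtsc ();
          (*fn) (rp, up, n, v);
          t1 = rdtsc ();
          if (t1 - t0 < best)
            best = t1 - t0;
        }
      return best;
    }

cpuvec_init could then just install whichever candidate routine comes
out fastest on the host.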


Content-Type: text/x-csrc
Content-Disposition: attachment; filename=fat.c

#include <string.h>

#include "gmp.h"
#include "gmp-impl.h"


/* config.h */
/* #define WANT_FAT_BINARY 1 */


/* gmp-impl.h */

#if WANT_FAT_BINARY
/* NOTE: The layout of this structure is also hard-coded in
   mpn/asm-defs.m4 (the CPUVEC_OFFSET_<FIELD> values).  Be sure to keep
   the two in sync when making any changes.  */
struct cpuvec_t {
  mp_limb_t (*addmul_1)     __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
                                          mp_limb_t));
  void      (*mul_basecase) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
                                          mp_srcptr, mp_size_t));
  void      (*sqr_basecase) __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
  mp_limb_t (*submul_1)     __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t,
                                          mp_limb_t));
};
extern struct cpuvec_t __gmpn_cpuvec;

/* direct calls through __gmpn_cpuvec */
#undef  mpn_addmul_1
#define mpn_addmul_1      (*__gmpn_cpuvec.addmul_1)
#undef  mpn_mul_basecase
#define mpn_mul_basecase  (*__gmpn_cpuvec.mul_basecase)
#undef  mpn_sqr_basecase
#define mpn_sqr_basecase  (*__gmpn_cpuvec.sqr_basecase)
#undef  mpn_submul_1
#define mpn_submul_1      (*__gmpn_cpuvec.submul_1)
#endif /* WANT_FAT_BINARY */




static mp_limb_t addmul_1_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
static void      mul_basecase_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
static void      sqr_basecase_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
static mp_limb_t submul_1_init __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));

/* Initially each entry points to an _init trampoline, which runs
   cpuvec_init and then re-dispatches through the filled-in vector.
   The order here must match struct cpuvec_t.  */
struct cpuvec_t __gmpn_cpuvec = {
  addmul_1_init,
  mul_basecase_init,
  sqr_basecase_init,
  submul_1_init,
};


/* Implemented in assembler (not shown here): executes cpuid with the
   given request in eax, returns eax, and stores the 12 bytes of
   ebx,edx,ecx at dest (the vendor string, for request 0).  */
long __gmpn_cpuid __GMP_PROTO ((char *, int));

mp_limb_t __gmpn_addmul_1_k6       __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_addmul_1_pentium  __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_addmul_1_x86      __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));

void __gmpn_mul_basecase_k6       __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
void __gmpn_mul_basecase_pentium  __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
void __gmpn_mul_basecase_x86      __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));

void __gmpn_sqr_basecase_k6       __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
void __gmpn_sqr_basecase_pentium  __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
void __gmpn_sqr_basecase_x86      __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));

mp_limb_t __gmpn_submul_1_k6       __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_submul_1_pentium  __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
mp_limb_t __gmpn_submul_1_x86      __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));

static void
cpuvec_init (void)
{
  char vendor_string[13];
  char dummy_string[12];
  long fms;
  int family, model;

  __gmpn_cpuid (vendor_string, 0);
  vendor_string[12] = 0;

  /* request 1 returns the family/model/stepping word in eax:
     stepping in bits 0-3, model in bits 4-7, family in bits 8-11
     (extended family/model fields ignored in this sketch) */
  fms = __gmpn_cpuid (dummy_string, 1);

  family = (fms >> 8) & 15;
  model = (fms >> 4) & 15;

  if (strcmp (vendor_string, "GenuineIntel") == 0)
    {
      switch (family)
	{
	case 5:
	  if (model <= 2)
            {
              /* pentium */
              __gmpn_cpuvec.addmul_1     = __gmpn_addmul_1_pentium;
              __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_pentium;
              __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_pentium;
              __gmpn_cpuvec.submul_1     = __gmpn_submul_1_pentium;
            }
	  else if (model >= 4)
            {
              /* pentiummmx */
            }
	  break;
	case 6:
	  if (model == 1)
            {
              /* pentiumpro */
            }
	  else if (model <= 6)
            {
              /* pentium2 */
            }
	  else
            {
              /* pentium3 */
            }
	  break;
	case 15:
	  /* pentium4 */
	  break;
	}
    }
  else if (strcmp (vendor_string, "AuthenticAMD") == 0)
    {
      switch (family)
	{
	case 5:
	  if (model <= 3)
            {
              /* k5 */
            }
	  else if (model <= 7)
            {
              /* k6 */
              __gmpn_cpuvec.addmul_1     = __gmpn_addmul_1_k6;
              __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_k6;
              __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_k6;
              __gmpn_cpuvec.submul_1     = __gmpn_submul_1_k6;
            }
	  else if (model <= 8)
            {
              /* k62 */
            }
	  else if (model <= 9)
            {
              /* k63 */
            }
	  break;
	case 6:
	  /* athlon */
	  break;
	}
    }
  else if (strcmp (vendor_string, "CyrixInstead") == 0)
    {
      /* Should recognize Cyrix' processors too.  */
    }

  /* Fallbacks for anything not already covered.

     FIXME: Test the capability bits, and choose routines using those
     insns.  */

  if (__gmpn_cpuvec.addmul_1 == NULL)
    __gmpn_cpuvec.addmul_1 = __gmpn_addmul_1_x86;

  if (__gmpn_cpuvec.mul_basecase == NULL)
    __gmpn_cpuvec.mul_basecase = __gmpn_mul_basecase_x86;

  if (__gmpn_cpuvec.sqr_basecase == NULL)
    __gmpn_cpuvec.sqr_basecase = __gmpn_sqr_basecase_x86;

  if (__gmpn_cpuvec.submul_1 == NULL)
    __gmpn_cpuvec.submul_1 = __gmpn_submul_1_x86;
}

static mp_limb_t
addmul_1_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t vl)
{
  cpuvec_init ();
  return (*__gmpn_cpuvec.addmul_1) (rp, up, un, vl);
}

static void
mul_basecase_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn)
{
  cpuvec_init ();
  (*__gmpn_cpuvec.mul_basecase) (rp, up, un, vp, vn);
}

static void
sqr_basecase_init (mp_ptr rp, mp_srcptr up, mp_size_t un)
{
  cpuvec_init ();
  (*__gmpn_cpuvec.sqr_basecase) (rp, up, un);
}

static mp_limb_t
submul_1_init (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t vl)
{
  cpuvec_init ();
  return (*__gmpn_cpuvec.submul_1) (rp, up, un, vl);
}
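
/* Hypothetical smoke test, not part of the sketch proper: the first call
   of each entrypoint goes through its _init trampoline, which runs
   cpuvec_init and then re-dispatches, so both calls below should behave
   identically.  */
#if 0
#include <stdio.h>
int
main (void)
{
  mp_limb_t  up[2] = { 123, 456 };
  mp_limb_t  rp[2] = { 1, 1 };
  mp_limb_t  c1, c2;
  c1 = mpn_addmul_1 (rp, up, (mp_size_t) 2, (mp_limb_t) 789); /* via _init */
  c2 = mpn_addmul_1 (rp, up, (mp_size_t) 2, (mp_limb_t) 789); /* direct */
  printf ("carries %lu %lu\n", (unsigned long) c1, (unsigned long) c2);
  return 0;
}
#endif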

Content-Disposition: attachment; filename=fat_entry.asm

dnl  x86 fat binary entrypoints.

dnl  Copyright 2003 Free Software Foundation, Inc.
dnl 
dnl  This file is part of the GNU MP Library.
dnl 
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl 
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl 
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


dnl  Forcibly disable profiling for this file; the routines jumped to
dnl  will have any profiling, and the code here is so small it's not
dnl  worth showing.

define(`WANT_PROFILING',no)


dnl  Usage: jumpto(offset)
dnl
dnl  Offset should be one of the CPUVEC_OFFSET_<FIELD> values.
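dnl
dnl  In PIC the GOT pointer is built in %edx (which is call-clobbered
dnl  and unused for argument passing here, the mpn arguments all being
dnl  on the stack), and the jump then goes indirectly through the
dnl  selected __gmpn_cpuvec entry.  Non-PIC can jump straight through
dnl  the vector slot at its absolute address.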

define(jumpto,
m4_assert_numargs(1)
`ifdef(`PIC',
`	call	L(movl_edx_eip)
L(here$1):
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here$1)], %edx
	movl	__gmpn_cpuvec@GOT(%edx), %edx
	jmp	*$1(%edx)
',`dnl non-PIC
	jmp	*__gmpn_cpuvec+$1
')')

define(al,`ifdef(`PIC',16,8)')

	TEXT
	ALIGN(al)
PROLOGUE(mpn_addmul_1)
	jumpto(CPUVEC_OFFSET_ADDMUL_1)
EPILOGUE()

	ALIGN(al)
PROLOGUE(mpn_mul_basecase)
	jumpto(CPUVEC_OFFSET_MUL_BASECASE)
EPILOGUE()

	ALIGN(al)
PROLOGUE(mpn_sqr_basecase)
	jumpto(CPUVEC_OFFSET_SQR_BASECASE)
EPILOGUE()

	ALIGN(al)
PROLOGUE(mpn_submul_1)
	jumpto(CPUVEC_OFFSET_SUBMUL_1)
EPILOGUE()


ifdef(`PIC',`
	ALIGN(al)
L(movl_edx_eip):
	movl	(%esp), %edx
	ret_internal
')
