x86-64 fixes: Part 4

Andreas Jaeger aj@suse.de
Thu, 24 Oct 2002 14:46:31 +0200



These patches populate the new mpn/x86-64 directory with some files.
Most of the files are taken from Athlon and enhanced for x86-64.

Andreas

diff -urN gmp-4.1/mpn/x86-64/gmp-mparam.h /suse/aj/gmp-4.1/mpn/x86-64/gmp-mparam.h
--- gmp-4.1/mpn/x86-64/gmp-mparam.h	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/gmp-mparam.h	Sun Sep 29 12:33:05 2002
@@ -0,0 +1,59 @@
+/* AMD x86-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+
+/* Starting values copied from an Athlon; presumably to be re-tuned for x86-64.  */
+
+#define MUL_KARATSUBA_THRESHOLD          26
+#define MUL_TOOM3_THRESHOLD             177
+
+#define SQR_BASECASE_THRESHOLD            0  /* always */
+#define SQR_KARATSUBA_THRESHOLD          52
+#define SQR_TOOM3_THRESHOLD             186
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 91
+#define POWM_THRESHOLD                  134
+
+#define GCD_ACCEL_THRESHOLD               3
+#define GCDEXT_THRESHOLD                 25
+#define JACOBI_BASE_METHOD                1
+
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1  /* native */
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             22
+#define GET_STR_PRECOMPUTE_THRESHOLD     35
+#define SET_STR_THRESHOLD              5634
+
+#define MUL_FFT_TABLE  { 848, 1696, 3712, 7680, 22528, 57344, 0 }
+#define MUL_FFT_MODF_THRESHOLD          880
+#define MUL_FFT_THRESHOLD              9984
+
+#define SQR_FFT_TABLE  { 784, 1824, 3712, 7680, 22528, 57344, 0 }
+#define SQR_FFT_MODF_THRESHOLD          848
+#define SQR_FFT_THRESHOLD              8448
+
diff -urN gmp-4.1/mpn/x86-64/lshift.asm /suse/aj/gmp-4.1/mpn/x86-64/lshift.asm
--- gmp-4.1/mpn/x86-64/lshift.asm	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/lshift.asm	Sun Sep 29 16:25:14 2002
@@ -0,0 +1,73 @@
+dnl  x86-64 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl  Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl 
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl 
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl 
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+
+	C Parameter
+	C rdi	dst
+	C rsi	src
+	C rdx	size
+	C rcx	shift
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshift)
+
+	subq	$8,%rsi			C bias src so (rsi,rdx,8) is limb rdx-1
+
+	movq	(%rsi,%rdx,8),%r8	C read most significant limb
+	xorq	%rax,%rax		C clear rax to receive the carry bits
+	shldq	%cl, %r8, %rax		C carry limb = bits shifted out the top
+	decq	%rdx			C rdx = size-1
+	jz	L(end)			C only one limb, no loop needed
+	movq	%rax,%r9		C save carry limb
+	testb	$1,%dl			C is size-1 odd
+	jnz	L(1)			C enter loop in the middle
+	movq	%r8,%rax		C L(oop) wants the upper limb in rax
+
+	ALIGN(8)
+L(oop):	movq	(%rsi,%rdx,8),%r8 	C load next lower limb
+	shldq	%cl, %r8, %rax		C compute result limb
+	movq	%rax,(%rdi,%rdx,8)	C store it
+	decq	%rdx
+L(1):	movq	(%rsi,%rdx,8),%rax	C load next lower limb
+	shldq	%cl, %rax, %r8		C compute result limb
+	movq	%r8,(%rdi,%rdx,8)	C store it
+	decq	%rdx
+	jnz	L(oop)			C loop until rdx is 0, rax is then limb 0
+
+	shlq	%cl,%rax		C compute least significant limb
+	movq	%rax,(%rdi)		C store it
+
+	movq	%r9,%rax		C get carry limb
+	ret				C carry limb is returned in rax
+
+L(end):	shlq	%cl,%r8 		C compute least significant limb
+	movq	%r8,(%rdi)		C store it
+
+	ret				C rax still holds the carry limb
+
+EPILOGUE()
diff -urN gmp-4.1/mpn/x86-64/rshift.asm /suse/aj/gmp-4.1/mpn/x86-64/rshift.asm
--- gmp-4.1/mpn/x86-64/rshift.asm	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/rshift.asm	Sun Sep 29 16:56:00 2002
@@ -0,0 +1,76 @@
+dnl  x86-64 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2001, 2002 Free Software
+dnl  Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl 
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl 
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl 
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+
+	C Parameter
+	C rdi	dst
+	C rsi	src
+	C rdx	size
+	C rcx	shift
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_rshift)
+
+	leaq	-8(%rdi,%rdx,8),%rdi	C rdi = address of dst limb size-1
+	leaq	(%rsi,%rdx,8),%rsi	C rsi = address just past src
+	negq	%rdx			C rdx = -size, counts up to zero
+
+	movq	(%rsi,%rdx,8),%r8	C read least significant limb
+	xorq	%rax,%rax		C clear rax to receive the carry bits
+	shrdq 	%cl, %r8, %rax 		C carry limb = bits shifted out the bottom
+	incq	%rdx			C rdx = 1-size
+	jz	L(end)			C only one limb, no loop needed
+	movq	%rax,%r9		C save carry limb
+	testb	$1,%dl			C is 1-size odd
+	jnz	L(1)			C enter loop in the middle
+	movq	%r8,%rax		C L(oop) wants the lower limb in rax
+
+	ALIGN(8)
+L(oop):	movq	(%rsi,%rdx,8),%r8	C load next higher limb
+	shrdq 	%cl, %r8, %rax 		C compute result limb
+	movq	%rax,(%rdi,%rdx,8)	C store it
+	incq	%rdx
+L(1):	movq	(%rsi,%rdx,8),%rax	C load next higher limb
+	shrdq 	%cl, %rax, %r8		C compute result limb
+	movq	%r8,(%rdi,%rdx,8)	C store it
+	incq	%rdx
+	jnz	L(oop)			C loop until rdx is 0, rax is then the top limb
+
+	shrq	%cl,%rax		C compute most significant limb
+	movq	%rax,(%rdi)		C store it
+
+	movq	%r9, %rax		C get carry limb
+	ret				C carry limb is returned in rax
+
+L(end):	shrq	%cl,%r8			C compute most significant limb
+	movq	%r8,(%rdi)		C store it
+
+	ret				C rax still holds the carry limb
+
+EPILOGUE()
diff -urN gmp-4.1/mpn/x86-64/udiv.asm /suse/aj/gmp-4.1/mpn/x86-64/udiv.asm
--- gmp-4.1/mpn/x86-64/udiv.asm	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/udiv.asm	Sun Sep 29 16:34:48 2002
@@ -0,0 +1,42 @@
+dnl  x86-64 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl 
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl 
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl 
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C                           mp_limb_t divisor);
+
+        C Parameter 
+        C rdi   remptr
+        C rsi   high
+        C rdx   low
+	C rcx	divisor
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_udiv_qrnnd)
+	movq	%rdx, %rax		C rax = low limb of the dividend
+	movq	%rsi, %rdx		C rdx = high limb of the dividend
+	divq	%rcx			C rax = quotient, rdx = remainder
+	movq	%rdx,(%rdi)		C store the remainder at remptr
+	ret				C quotient is returned in rax
+EPILOGUE()
diff -urN gmp-4.1/mpn/x86-64/umul.asm /suse/aj/gmp-4.1/mpn/x86-64/umul.asm
--- gmp-4.1/mpn/x86-64/umul.asm	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/umul.asm	Sun Sep 29 16:32:15 2002
@@ -0,0 +1,41 @@
+dnl  mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl 
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl 
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl 
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+        C Parameter 
+        C rdi   lowptr 
+        C rsi   m1
+        C rdx   m2
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_umul_ppmm)
+	movq	%rdx, %rax		C rax = m2
+	mulq	%rsi			C rdx:rax = m1 * m2
+	movq	%rax, (%rdi)		C store the low limb at lowptr
+	movq	%rdx, %rax		C return the high limb
+	ret
+EPILOGUE()
diff -urN gmp-4.1/mpn/x86-64/x86-64-defs.m4 /suse/aj/gmp-4.1/mpn/x86-64/x86-64-defs.m4
--- gmp-4.1/mpn/x86-64/x86-64-defs.m4	Thu Jan  1 01:00:00 1970
+++ /suse/aj/gmp-4.1/mpn/x86-64/x86-64-defs.m4	Mon Aug 19 16:13:36 2002
@@ -0,0 +1,332 @@
+divert(-1)
+
+dnl  m4 macros for x86-64 assembler.
+
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+dnl 
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+
+deflit(BYTES_PER_MP_LIMB, 8)
+
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  Emits GLOBL, TYPE and the label, plus call_mcount unless WANT_PROFILING
+dnl  is `no'.  TEXT and ALIGN() are left to the caller, since different
+dnl  alignments are wanted in various circumstances.  So for instance,
+dnl
+dnl                  TEXT
+dnl                  ALIGN(16)
+dnl          PROLOGUE(mpn_add_n)
+dnl          ...
+dnl          EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`GLOBL	$1
+	TYPE($1,`function')
+$1:
+ifelse(WANT_PROFILING,`no',,`call_mcount
+')')
+
+
+dnl  Usage: call_mcount
+dnl
+dnl  For `gprof' style profiling, %ebp is setup as a frame pointer.  None of
+dnl  the assembler routines use %ebp this way, so it's done only for mcount.
+dnl  glibc sysdeps/i386/i386-mcount.S shows how mcount gets the current
+dnl  function from (%esp) and the parent from 4(%ebp).  For `prof' style, gcc
+dnl  generates mcount calls without setting up %ebp, as is done here.
+dnl  NOTE(review): still the i386 sequence.  The pushl and popl used for gprof
+dnl  and for PIC are not encodable in 64-bit mode, so this needs porting.
+
+define(`call_mcount',
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+m4_assert_defined(`MCOUNT_PIC_REG')
+m4_assert_defined(`MCOUNT_NONPIC_REG')
+m4_assert_defined(`MCOUNT_PIC_CALL')
+m4_assert_defined(`MCOUNT_NONPIC_CALL')
+`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
+`	DATA
+	ALIGN(4)
+L(mcount_data_`'mcount_data_counter):
+	W32	0
+	TEXT
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	pushl	%ebp
+	movl	%esp, %ebp
+')dnl
+ifdef(`PIC',
+`	pushl	%ebx
+	mcount_movl_GOT_ebx
+ifelse(MCOUNT_PIC_REG,,,
+`	leal	L(mcount_data_`'mcount_data_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
+MCOUNT_PIC_CALL
+	popl	%ebx
+',`dnl non-PIC
+ifelse(MCOUNT_NONPIC_REG,,,
+`	movl	`$'L(mcount_data_`'mcount_data_counter), MCOUNT_NONPIC_REG
+')dnl
+MCOUNT_NONPIC_CALL
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	popl	%ebp
+')
+define(`mcount_data_counter',eval(mcount_data_counter+1))')
+
+define(mcount_data_counter,1)
+
+dnl  Called: mcount_movl_GOT_ebx
+dnl  Sets %ebx to the GOT address.  Label H is "here", the %eip obtained from
+dnl  the call.  C is the called subroutine.  J is the jump across it.  A
+dnl  fetch and "ret" is always done so calls and returns are balanced for the
+dnl  benefit of the various x86s that have return stack branch prediction.
+define(mcount_movl_GOT_ebx,
+m4_assert_numargs(-1)
+`	call	L(mcount_movl_GOT_ebx_C`'mcount_movl_GOT_ebx_counter)
+L(mcount_movl_GOT_ebx_H`'mcount_movl_GOT_ebx_counter):
+	jmp	L(mcount_movl_GOT_ebx_J`'mcount_movl_GOT_ebx_counter)
+L(mcount_movl_GOT_ebx_C`'mcount_movl_GOT_ebx_counter):
+	movl	(%esp), %ebx
+	ret
+L(mcount_movl_GOT_ebx_J`'mcount_movl_GOT_ebx_counter):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(mcount_movl_GOT_ebx_H`'mcount_movl_GOT_ebx_counter)], %ebx
+define(`mcount_movl_GOT_ebx_counter',incr(mcount_movl_GOT_ebx_counter))')
+
+define(mcount_movl_GOT_ebx_counter,1)
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Various x86 macros.
+dnl
+
+
+dnl  Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl  Align to `offset' bytes past a multiple of `bytes': ALIGN then nops.
+dnl
+dnl  This is useful for testing, for example align to something very strict
+dnl  and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl  Generally you wouldn't execute across the padding, but it's done with
+dnl  nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,`	nop
+')')
+
+
+dnl  Usage: defframe(name,offset)
+dnl
+dnl  Make a definition like the following with which to access a parameter
+dnl  or variable on the stack.
+dnl
+dnl         define(name,`FRAME+offset(%rsp)')
+dnl
+dnl  Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl  byte if FRAME+offset is zero, by putting (%rsp) rather than 0(%rsp).
+dnl  Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl  zero offset is wanted.
+dnl
+dnl  The new macro also gets a check that when it's used FRAME is actually
+dnl  defined, and that the final %rsp offset isn't negative, which would
+dnl  mean an attempt to access something below the current %rsp.
+dnl
+dnl  deflit() is used rather than a plain define(), so the new macro won't
+dnl  delete any following parenthesized expression.  name(%rdi) will come
+dnl  out say as 16(%rsp)(%rdi).  This isn't valid assembler and should
+dnl  provoke an error, which is better than silently giving just 16(%rsp).
+dnl
+dnl  See README for more on the suggested way to access the stack frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%rsp)')')
+
+dnl  Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+m4_assert_numargs(1)
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl  Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl  Usage: FRAME_pushl()        (the names are the 32-bit x86 ones,
+dnl         FRAME_popl()          as used by ASSERT and defframe_pushl)
+dnl         FRAME_addl_esp(n)
+dnl         FRAME_subl_esp(n)
+dnl
+dnl  Adjust FRAME appropriately for an 8-byte pushq or popq, or for an addq
+dnl  or subq %rsp of n bytes.
+dnl
+dnl  Using these macros is completely optional.  Sometimes it makes more
+dnl  sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl  jumps and different sequences of FRAME values need to be used in
+dnl  different places.
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+8))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-8))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl  Usage: defframe_pushl(name)
+dnl
+dnl  Do a combination FRAME_pushl() and a defframe() to name the stack
+dnl  location just pushed.  This should come after a pushq instruction.
+dnl  Putting it on the same line works and avoids lengthening the code.  For
+dnl  example,
+dnl
+dnl         pushq   %rax     defframe_pushl(VAR_COUNTER)
+dnl
+dnl  Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl  current value without tracking future changes.
+
+define(defframe_pushl,
+m4_assert_numargs(1)
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Assembler instruction macros.
+dnl
+
+
+dnl  Usage: shldl(count,src,dst)
+dnl         shrdl(count,src,dst)
+dnl         shldw(count,src,dst)
+dnl         shrdw(count,src,dst)
+dnl
+dnl  Generate a double-shift instruction, possibly omitting a %cl count
+dnl  parameter if that's what the assembler requires, as indicated by
+dnl  WANT_SHLDL_CL in config.m4.  For example,
+dnl
+dnl         shldl(  %cl, %eax, %ebx)
+dnl
+dnl  turns into either
+dnl
+dnl         shldl   %cl, %eax, %ebx
+dnl  or
+dnl         shldl   %eax, %ebx
+dnl
+dnl  Immediate counts are always passed through unchanged, for example
+dnl  "shrdl( $2, %esi, %edi)" becomes "shrdl $2, %esi, %edi".
+dnl
+dnl  If you forget to use the macro form "shldl( ...)" and instead write
+dnl  just a plain "shldl ...", an error results.  This ensures the necessary
+dnl  variant treatment of %cl isn't accidentally bypassed.
+dnl
+dnl  Only the l and w forms are wrapped.  The shldq and shrdq in the x86-64
+dnl  lshift.asm and rshift.asm are plain instructions, always with %cl.
+dnl  NOTE(review): check that is accepted when WANT_SHLDL_CL is 0.
+
+define(define_shd_instruction,
+m4_assert_numargs(1)
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl  Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl  Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1'	`$3', `$4'',
+``$1'	`$2', `$3', `$4'')')
+
+
+dnl  Usage: ASSERT([cond][,instructions])
+dnl
+dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl  flags condition to then be satisfied, else a ud2 is hit.  For example,
+dnl
+dnl         ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl  The instructions can be omitted to just assert a flags condition with
+dnl  no extra calculation.  For example,
+dnl
+dnl         ASSERT(nc)
+dnl
+dnl  When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl  flags, but the instructions themselves must preserve any registers that
+dnl  matter.  FRAME is adjusted for the push and pop, so the instructions
+dnl  given can use defframe() stack variables.
+dnl
+dnl  The condition can be omitted to just output the given instructions when
+dnl  assertion checking is wanted.  In this case the pushf/popf is omitted.
+dnl  For example,
+dnl
+dnl         ASSERT(, `movl %eax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+	`$2',
+	`C ASSERT
+ifelse(`$2',,,`	pushf	ifdef(`FRAME',`FRAME_pushl()')')
+	$2
+	j`$1'	L(ASSERT_ok`'ASSERT_counter)
+	ud2	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,`	popf	ifdef(`FRAME',`FRAME_popl()')')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+divert`'dnl
-- 
 Andreas Jaeger
  SuSE Labs aj@suse.de
   private aj@arthur.inka.de
    http://www.suse.de/~aj