Build failure for 5.1.0-RC2 on Mac OS 10.8

Wed Dec 12 02:23:08 CET 2012

On Tue, Dec 11, 2012 at 06:20:22PM -0500, Jack Howarth wrote:
> On Tue, Dec 11, 2012 at 11:15:35PM +0100, Torbjorn Granlund wrote:
> > Jack Howarth <howarth at bromo.med.uc.edu> writes:
> > 
> >   +  .set LC1, L1-Ltab
> >   +  .long	LC1
> >   
> >   and his reasoning as follows...
> >   
> >   --------------------------------------------------
> >   
> >   The deal is, you want the assembler to evaluate the deltas and not
> >   pass relocations on to the linker. You can see the compiler doing this
> >   if you just compile a simple switch like:
> >   
> > Thanks for the information.
> > 
> > This behaviour is highly unusual, and I know not all Apple assembler
> > releases require that, and I would expect some Apple assemblers to
> > actually reject that syntax.
> 
> Torbjorn,
>    I can confirm that the attached mod_34lsub1.fixed.s can be compiled
> with 'as -arch x86_64' on the following Apple Xcode releases...
> 
> Xcode 4.5.2 on darwin12
> Xcode 4.5.2 on darwin11
> Xcode 4.2 on darwin10
> Xcode 3.2.6 on darwin10
> Xcode 3.1.4 on darwin9
> 
> I don't have darwin8 available to check Xcode 2.5 but David Fang
> can probably check that for us.
>              Jack
> 

Torbjorn,
    The attached version of mod_34lsub1.asm adds the missing LC0 entry and now allows all of make check to pass
on x86_64-apple-darwin12. Compared to the copy in gmp-5.1.0-RC2, the diff is...

--- mod_34lsub1.asm.orig	2012-12-09 16:29:52.000000000 -0500
+++ mod_34lsub1.asm	2012-12-11 20:13:44.000000000 -0500
@@ -107,18 +107,25 @@
 	lea	(%r10, %r8), %r8
 	jmp	*%r8
 
-	RODATA
-L(tab):	.long	L(0)-L(tab)
-	.long	L(1)-L(tab)
-	.long	L(2)-L(tab)
-	.long	L(3)-L(tab)
-	.long	L(4)-L(tab)
-	.long	L(5)-L(tab)
-	.long	L(6)-L(tab)
-	.long	L(7)-L(tab)
-	.long	L(8)-L(tab)
+Ltab:   .set LC0, L0-Ltab
+	.long        LC0
+	.set LC1, L1-Ltab
+	.long        LC1
+	.set LC2, L2-Ltab
+	.long        LC2
+	.set LC3, L3-Ltab
+	.long        LC3
+	.set LC4, L4-Ltab
+	.long        LC4
+	.set LC5, L5-Ltab
+	.long        LC5
+	.set LC6, L6-Ltab
+	.long        LC6
+	.set LC7, L7-Ltab
+	.long        LC7
+	.set LC8, L8-Ltab
+	.long        LC8
 
-	TEXT
 L(6):	add	(ap), %rax
 	adc	8(ap), %rcx
 	adc	16(ap), %rdx

I suspect this should work fine on all darwin (although someone should check darwin8).
           Jack

> > 
> > It should be noted that no other concurrent assemblers fail to compute
> > sym1-sym2 when both symbols are in the same segment.  They don't
> > require any contortions in order to trigger symbol arithmetic that is
> > fully foldable.
> > 
> > We know that the code of RC1 works with some Apple assemblers/linkers,
> > but not with ones older and newer.  This makes me take an extremely
> > conservative stance wrt Apple porting.
> > 
> > We will never get stable Apple support, however hard we try.  Only if
> > Apple decided what to support, wrote a good test suite enforcing
> > that decision, and educated their staff, could we reach
> > the same GMP quality with Apple as with other platforms.  But the
> > damage is already done with the mess of inconsistent tools out there.
> > 
> > -- 
> > Torbjörn

> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 
> 	.text
> 	.align	5, 0x90
> 	.globl	___gmpn_mod_34lsub1
> 	
> ___gmpn_mod_34lsub1:
> 
> 	
> 
> 	mov	$0x0000FFFFFFFFFFFF, %r11
> 
> 	mov	(%rdi), %rax
> 
> 	cmp	$2, %rsi
> 	ja	Lgt2
> 
> 	jb	Lone
> 
> 	mov	8(%rdi), %rsi
> 	mov	%rax, %rdx
> 	shr	$48, %rax		
> 
> 	and	%r11, %rdx		
> 	add	%rdx, %rax
> 	mov	%esi, %edx
> 
> 	shr	$32, %rsi		
> 	add	%rsi, %rax
> 
> 	shl	$16, %rdx		
> 	add	%rdx, %rax
> Lone:	
> 	ret
> 
> 
> 
> 
> 
> Lgt2:	mov	8(%rdi), %rcx
> 	mov	16(%rdi), %rdx
> 	xor	%r9, %r9
> 	add	$24, %rdi
> 	sub	$12, %rsi
> 	jc	Lend
> 	.align	4, 0x90
> Ltop:
> 	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	24(%rdi), %rax
> 	adc	32(%rdi), %rcx
> 	adc	40(%rdi), %rdx
> 	adc	$0, %r9
> 	add	48(%rdi), %rax
> 	adc	56(%rdi), %rcx
> 	adc	64(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$72, %rdi
> 	sub	$9, %rsi
> 	jnc	Ltop
> 
> Lend:	lea	Ltab(%rip), %r8
> 
> 
> 	movslq	36(%r8,%rsi,4), %r10
> 	lea	(%r10, %r8), %r8
> 	jmp	*%r8
> 
> Ltab:	
>   .set LC1, L1-Ltab
>   .long	LC1
>   .set LC2, L2-Ltab
>   .long	LC2
>   .set LC3, L3-Ltab
>   .long	LC3
>   .set LC4, L4-Ltab
>   .long	LC4
>   .set LC5, L5-Ltab
>   .long	LC5
>   .set LC6, L6-Ltab
>   .long	LC6
>   .set LC7, L7-Ltab
>   .long	LC7
>   .set LC8, L8-Ltab
>   .long	LC8
> 
> 
> L6:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$24, %rdi
> L3:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	jmp	Lcj1
> 
> L7:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$24, %rdi
> L4:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$24, %rdi
> L1:	add	(%rdi), %rax
> 	adc	$0, %rcx
> 	jmp	Lcj2
> 
> L8:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$24, %rdi
> L5:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 	adc	16(%rdi), %rdx
> 	adc	$0, %r9
> 	add	$24, %rdi
> L2:	add	(%rdi), %rax
> 	adc	8(%rdi), %rcx
> 
> Lcj2:	adc	$0, %rdx
> Lcj1:	adc	$0, %r9
> L0:	add	%r9, %rax
> 	adc	$0, %rcx
> 	adc	$0, %rdx
> 	adc	$0, %rax
> 
> 	mov	%rax, %rdi		
> 	shr	$48, %rax		
> 
> 	and	%r11, %rdi		
> 	mov	%ecx, %r10d	
> 
> 	shr	$32, %rcx		
> 
> 	add	%rdi, %rax		
> 	movzwl	%dx, %edi		
> 	shl	$16, %r10		
> 
> 	add	%rcx, %rax		
> 	shr	$16, %rdx		
> 
> 	add	%r10, %rax		
> 	shl	$32, %rdi		
> 
> 	add	%rdx, %rax		
> 	add	%rdi, %rax		
> 
> 	
> 	ret
> 	

> _______________________________________________
> gmp-bugs mailing list
> gmp-bugs at gmplib.org
> https://gmplib.org/mailman/listinfo/gmp-bugs

-------------- next part --------------
dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011, 2012 Free
dnl  Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	    cycles/limb
C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10	 0.67	   this seems hard to beat
C AMD bd1	 1
C AMD bobcat	 1.07
C Intel P4	 7.35	   terrible, use old code
C Intel core2	 1.25	   1+epsilon with huge unrolling
C Intel NHM	 1.15	   this seems hard to beat
C Intel SBR	 0.93
C Intel atom	 2.5
C VIA nano	 1.25	   this seems hard to beat

C INPUT PARAMETERS
C ap is an m4 alias for %rdi and n is an m4 alias for %rsi.  The code below
C uses ap for every pointer access but mostly spells the count as %rsi.
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.

C NOTE(review): ABI_SUPPORT is defined outside this file; presumably these two
C lines declare the calling conventions this source can be built for.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C In:	ap (%rdi) = address of the limb vector, n (%rsi) = number of limbs
C Out:	%rax = the limbs folded together modulo 2^48-1
C Uses:	%rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11 as scratch
C
C The first limb is loaded before n is examined, so callers presumably always
C pass n >= 1 (TODO confirm against the C prototype documentation).
C
C Limbs are summed into three accumulators according to their position mod 3
C (%rax, %rcx, %rdx), with carries out of the third collected in %r9.  The
C three sums are then combined with 48-bit folding in the wind-down code.
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C mask of the low 48 bits

	mov	(ap), %rax

	cmp	$2, %rsi
	ja	L(gt2)			C n > 2: general case

	jb	L(one)			C n < 2: return the first limb as is

C n == 2: fold the two limbs directly.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high (top 16 bits)

	and	%r11, %rdx		C src[0] low (low 48 bits)
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Don't change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C General case: seed the three accumulators with the first three limbs, then
C consume UNROLL*3 limbs per loop iteration.  The count is biased by
C -(UNROLL*3+3) so that a borrow from the subtraction signals that fewer than
C one full iteration remains.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9		C carry accumulator
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Here %rsi is in the range -9..-1, and %rsi+9 is the number of limbs still
C to be added (0..8).  36+4*%rsi therefore indexes the 9-entry table below.
C Each entry is a 32-bit offset from the table start to the handler.
L(end):	LEA(	L(tab), %r8)
	movslq	36(%r8,%rsi,4), %r10
	lea	(%r10, %r8), %r8
	jmp	*%r8

C Jump table, entry k handles k remaining limbs.
C
C The table stays in the text section next to its targets, and every entry is
C written as a .set followed by a .long of the resulting symbol.  That makes
C the assembler evaluate each label difference itself instead of passing a
C relocation on to the linker, which some Apple assemblers need.
C
C The labels are spelled with the L macro, exactly like their definitions and
C like the L(tab) reference just above, so the table follows whatever prefix
C that macro expands to instead of assuming a bare L.
L(tab):	.set	LC0, L(0)-L(tab)
	.long	LC0
	.set	LC1, L(1)-L(tab)
	.long	LC1
	.set	LC2, L(2)-L(tab)
	.long	LC2
	.set	LC3, L(3)-L(tab)
	.long	LC3
	.set	LC4, L(4)-L(tab)
	.long	LC4
	.set	LC5, L(5)-L(tab)
	.long	LC5
	.set	LC6, L(6)-L(tab)
	.long	LC6
	.set	LC7, L(7)-L(tab)
	.long	LC7
	.set	LC8, L(8)-L(tab)
	.long	LC8

C Handlers for 6 and 3 remaining limbs: whole groups of three only.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)

C Handlers for 7, 4 and 1 remaining limbs: groups of three, then one limb.
L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

C Handlers for 8, 5 and 2 remaining limbs: groups of three, then two limbs.
L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

C Propagate the pending carry through the accumulators not touched by the
C last partial group, then wrap the collected carries in %r9 back into %rax.
L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Wind-down: split each accumulator at the bit position that lines it up with
C a 48-bit boundary and add all the pieces into %rax.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()