Build failure for 5.1.0-RC2 on Mac OS 10.8
Jack Howarth
howarth at bromo.med.uc.edu
Wed Dec 12 02:23:08 CET 2012
On Tue, Dec 11, 2012 at 06:20:22PM -0500, Jack Howarth wrote:
> On Tue, Dec 11, 2012 at 11:15:35PM +0100, Torbjorn Granlund wrote:
> > Jack Howarth <howarth at bromo.med.uc.edu> writes:
> >
> > + .set LC1, L1-Ltab
> > + .long LC1
> >
> > and his reasoning as follows...
> >
> > --------------------------------------------------
> >
> > The deal is, you want the assembler to evaluate the deltas and not
> > pass relocations on to the linker. You can see the compiler doing this
> > if you just compile a simple switch like:
> >
> > Thanks for the information.
> >
> > This behaviour is highly unusual, and I know not all Apple assembler
> > releases require that, and I would expect some Apple assemblers to
> > actually reject that syntax.
>
> Torbjorn,
> I can confirm that the attached mod_34lsub1.fixed.s can be compiled
> with 'as -arch x86_64' on the following Apple Xcode releases...
>
> Xcode 4.5.2 on darwin12
> Xcode 4.5.2 on darwin11
> Xcode 4.2 on darwin10
> Xcode 3.2.6 on darwin10
> Xcode 3.1.4 on darwin9
>
> I don't have darwin8 available to check Xcode 2.5 but David Fang
> can probably check that for us.
> Jack
>
Torbjorn,
The attached version of mod_34lsub1.asm adds the missing LC0 entry and now allows all of make check to pass
on x86_64-apple-darwin12. Compared to the copy in gmp-5.1.0-RC2, the diff is...
--- mod_34lsub1.asm.orig 2012-12-09 16:29:52.000000000 -0500
+++ mod_34lsub1.asm 2012-12-11 20:13:44.000000000 -0500
@@ -107,18 +107,25 @@
lea (%r10, %r8), %r8
jmp *%r8
- RODATA
-L(tab): .long L(0)-L(tab)
- .long L(1)-L(tab)
- .long L(2)-L(tab)
- .long L(3)-L(tab)
- .long L(4)-L(tab)
- .long L(5)-L(tab)
- .long L(6)-L(tab)
- .long L(7)-L(tab)
- .long L(8)-L(tab)
+Ltab: .set LC0, L0-Ltab
+ .long LC0
+ .set LC1, L1-Ltab
+ .long LC1
+ .set LC2, L2-Ltab
+ .long LC2
+ .set LC3, L3-Ltab
+ .long LC3
+ .set LC4, L4-Ltab
+ .long LC4
+ .set LC5, L5-Ltab
+ .long LC5
+ .set LC6, L6-Ltab
+ .long LC6
+ .set LC7, L7-Ltab
+ .long LC7
+ .set LC8, L8-Ltab
+ .long LC8
- TEXT
L(6): add (ap), %rax
adc 8(ap), %rcx
adc 16(ap), %rdx
I suspect this should work fine on all darwin (although someone should check darwin8).
Jack
> >
> > It should be noted that no other concurrent assemblers fail to compute
> > sym1-sym2 when both symbols are in the same segment. They don't
> > require any contortions in order to trigger symbol arithmetic that is
> > fully foldable.
> >
> > We know that the code of RC1 works with some Apple assemblers/linkers,
> > but not with ones older and newer. This makes me take an extremely
> > conservative stance wrt Apple porting.
> >
> > We will never get stable Apple support, however hard we try. Only if
> > Apple decided for what to support, and wrote a good test suite
> > enforcing the decision, and educated their staff, then we could reach
> > the same GMP quality with Apple as with other platforms. But the
> > damage is already done with the mess of inconsistent tools out there.
> >
> > --
> > Torbjörn
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
> .text
> .align 5, 0x90
> .globl ___gmpn_mod_34lsub1
>
> ___gmpn_mod_34lsub1:
>
>
>
> mov $0x0000FFFFFFFFFFFF, %r11
>
> mov (%rdi), %rax
>
> cmp $2, %rsi
> ja Lgt2
>
> jb Lone
>
> mov 8(%rdi), %rsi
> mov %rax, %rdx
> shr $48, %rax
>
> and %r11, %rdx
> add %rdx, %rax
> mov %esi, %edx
>
> shr $32, %rsi
> add %rsi, %rax
>
> shl $16, %rdx
> add %rdx, %rax
> Lone:
> ret
>
>
>
>
>
> Lgt2: mov 8(%rdi), %rcx
> mov 16(%rdi), %rdx
> xor %r9, %r9
> add $24, %rdi
> sub $12, %rsi
> jc Lend
> .align 4, 0x90
> Ltop:
> add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add 24(%rdi), %rax
> adc 32(%rdi), %rcx
> adc 40(%rdi), %rdx
> adc $0, %r9
> add 48(%rdi), %rax
> adc 56(%rdi), %rcx
> adc 64(%rdi), %rdx
> adc $0, %r9
> add $72, %rdi
> sub $9, %rsi
> jnc Ltop
>
> Lend: lea Ltab(%rip), %r8
>
>
> movslq 36(%r8,%rsi,4), %r10
> lea (%r10, %r8), %r8
> jmp *%r8
>
> Ltab:
> .set LC1, L1-Ltab
> .long LC1
> .set LC2, L2-Ltab
> .long LC2
> .set LC3, L3-Ltab
> .long LC3
> .set LC4, L4-Ltab
> .long LC4
> .set LC5, L5-Ltab
> .long LC5
> .set LC6, L6-Ltab
> .long LC6
> .set LC7, L7-Ltab
> .long LC7
> .set LC8, L8-Ltab
> .long LC8
>
>
> L6: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add $24, %rdi
> L3: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> jmp Lcj1
>
> L7: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add $24, %rdi
> L4: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add $24, %rdi
> L1: add (%rdi), %rax
> adc $0, %rcx
> jmp Lcj2
>
> L8: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add $24, %rdi
> L5: add (%rdi), %rax
> adc 8(%rdi), %rcx
> adc 16(%rdi), %rdx
> adc $0, %r9
> add $24, %rdi
> L2: add (%rdi), %rax
> adc 8(%rdi), %rcx
>
> Lcj2: adc $0, %rdx
> Lcj1: adc $0, %r9
> L0: add %r9, %rax
> adc $0, %rcx
> adc $0, %rdx
> adc $0, %rax
>
> mov %rax, %rdi
> shr $48, %rax
>
> and %r11, %rdi
> mov %ecx, %r10d
>
> shr $32, %rcx
>
> add %rdi, %rax
> movzwl %dx, %edi
> shl $16, %r10
>
> add %rcx, %rax
> shr $16, %rdx
>
> add %r10, %rax
> shl $32, %rdi
>
> add %rdx, %rax
> add %rdi, %rax
>
>
> ret
>
> _______________________________________________
> gmp-bugs mailing list
> gmp-bugs at gmplib.org
> https://gmplib.org/mailman/listinfo/gmp-bugs
-------------- next part --------------
dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011, 2012 Free
dnl Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10 0.67 this seems hard to beat
C AMD bd1 1
C AMD bobcat 1.07
C Intel P4 7.35 terrible, use old code
C Intel core2 1.25 1+epsilon with huge unrolling
C Intel NHM 1.15 this seems hard to beat
C Intel SBR 0.93
C Intel atom 2.5
C VIA nano 1.25 this seems hard to beat
C INPUT PARAMETERS
define(`ap', %rdi)
define(`n', %rsi)
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C TODO
C * Review feed-in and wind-down code.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(32)
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C Adds the n limbs at ap into three 64-bit accumulators selected by limb
C index mod 3, then folds the three sums into one limb using shifts by
C 48, 32 and 16 bits and the 48-bit mask held in %r11.
C
C Register use:
C   ap (%rdi)   source pointer, advanced as limbs are consumed
C   %rsi        limb count, later the (negative) wind-down index
C   %rax        sum of limbs 0 mod 3, also the return value
C   %rcx        sum of limbs 1 mod 3
C   %rdx        sum of limbs 2 mod 3
C   %r9         count of carries out of the %rax/%rcx/%rdx chain
C   %r11        constant 2^48-1
C   %r8, %r10   jump table base and entry
C
C The first limb is loaded unconditionally, so n is presumably >= 1
C (NOTE(review): confirm against the callers).
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C 48-bit mask

	mov	(ap), %rax

	cmp	$2, %rsi
	ja	L(gt2)				C n > 2, general case
	jb	L(one)				C n < 2, return the single limb as is

	C n == 2, fold the two limbs directly
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high

	and	%r11, %rdx		C src[0] low
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Do not change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C The first three limbs seed the accumulators.  The count is biased by
C UNROLL*3+3 so that the carry flag signals when fewer than UNROLL*3 limbs
C remain.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Here %rsi is in the range -9..-1, so 36(%r8,%rsi,4) selects table entry
C %rsi+9, which is the number of limbs still to be added (0..8).  Each entry
C is a 32-bit offset from the table base to the matching wind-down label.
L(end):	LEA(	L(tab), %r8)
	movslq	36(%r8,%rsi,4), %r10
	lea	(%r10, %r8), %r8
	jmp	*%r8

C Jump table.  It is kept in the same section as the code, and every delta
C is bound to a symbol with .set before being emitted, so that the assembler
C evaluates the difference itself and passes no relocation to the linker.
C The table label and its targets go through L() so that they name the same
C symbols as the LEA above and the L(0)..L(8) definitions below, whatever
C the local label prefix expands to.
L(tab):	.set	LC0, L(0)-L(tab)
	.long	LC0
	.set	LC1, L(1)-L(tab)
	.long	LC1
	.set	LC2, L(2)-L(tab)
	.long	LC2
	.set	LC3, L(3)-L(tab)
	.long	LC3
	.set	LC4, L(4)-L(tab)
	.long	LC4
	.set	LC5, L(5)-L(tab)
	.long	LC5
	.set	LC6, L(6)-L(tab)
	.long	LC6
	.set	LC7, L(7)-L(tab)
	.long	LC7
	.set	LC8, L(8)-L(tab)
	.long	LC8

C Wind-down: L(k) adds the last k limbs.  The chains fall through in steps
C of three limbs: 6 -> 3, 7 -> 4 -> 1, 8 -> 5 -> 2.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)

L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

C Finish the pending carry chain, then add the collected carries in %r9 back
C in at the bottom and wrap the final carry out of %rdx around into %rax.
L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Fold the three accumulators into %rax.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()
More information about the gmp-bugs
mailing list