Assembly routines break Windows 64-bit SEH

E. Madison Bray erik.m.bray at gmail.com
Thu May 2 12:16:46 UTC 2019


Hello,

Attached is a sample program which deliberately induces a segmentation
fault in some of GMP's assembly code; in this case mpn_divrem_1, since
that happens to be where I first discovered the problem, though it
could affect nearly any of the assembly routines.  I am working on
64-bit Cygwin, but this problem can affect any code on Windows
(Cygwin, MinGW, or native Windows) that uses GMP.  The problem is
particularly severe on Cygwin, which relies on structured exception
handling (SEH) to catch exceptions and convert them to POSIX signals
and/or return proper error codes.

If everything were working correctly, compiling and running the
attached program would go as follows:

$ gcc test.c -lgmp
$ ./a.exe; echo $?
Segmentation fault (core dumped)
139

Instead, this is what currently happens:

$ ./a.exe; echo $?
0

There is no output, and the process exit code is zero.  (The latter
is an unfortunate quirk of Cygwin: if a process terminates due to an
exception and Cygwin's exception handler was somehow not able to run,
the exit code defaults to zero.)

This is because the assembly routines do not include the unwind
metadata that 64-bit Windows requires [1] for stack unwinding to work
properly during exception handling.  This was brought up once before
on this list many years ago [2], but as far as I can tell nothing has
ever been done about it.

I was able to confirm that this was the issue by manually editing the
assembly for mpn_divrem_1 and recompiling/linking.  I modified the
function prologue to look like:

        .seh_proc __gmpn_divrem_1_x86_64
__gmpn_divrem_1_x86_64:

        push    %rdi
        .seh_pushreg    %rdi
        push    %rsi
        .seh_pushreg    %rsi
        mov     %rcx, %rdi
        mov     %rdx, %rsi
        mov     %r8, %rdx
        mov     %r9, %rcx

        mov     56(%rsp), %r8
        xor     %eax, %eax
        push    %r13
        .seh_pushreg    %r13
        push    %r12
        .seh_pushreg    %r12
        push    %rbp
        .seh_pushreg    %rbp
        push    %rbx
        .seh_pushreg    %rbx
        .seh_endprologue

At the end of the function, a matching .seh_endproc directive is
needed as well.
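
For concreteness, here is the corresponding end of the function as it
appears in the attached assembly:

Lret:   pop     %rbx
        pop     %rbp
        pop     %r12
        pop     %r13
        pop     %rsi
        pop     %rdi
        ret
        .seh_endproc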

Technically this is still not quite correct, because the procedure
later modifies RSP in preparation for a `call    __gmpn_invert_limb`.
Handling that properly requires establishing a frame pointer in the
prologue; otherwise, if an exception occurred inside
__gmpn_invert_limb, stack unwinding would still fail.  That does not
happen in my test case, so the annotations above are enough to make
it work, but the subtlety is worth noting.
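
A prologue that establishes a frame pointer might look roughly like
the following (just a sketch; I use RBP here for illustration, though
as I note below any nonvolatile register would do):

        push    %rbp
        .seh_pushreg    %rbp
        mov     %rsp, %rbp
        .seh_setframe   %rbp, 0
        .seh_endprologue

Once a frame register is recorded in the unwind info, later
adjustments to RSP no longer prevent the unwinder from finding its
way up the stack.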

I believe this is possible to fix in general, and would be happy to
work on a patch if it would be accepted in principle.

Many of the existing m4 macros in the assembly routines (such as
PROLOGUE and EPILOGUE) can be modified for x64 to add the necessary
bits.  It would probably also be good to replace some of the explicit
`push <reg>` instructions in the prologues with new macros similar to
those provided by MASM [3].  For example, a `push_reg(<reg>)` macro
would emit (on 64-bit Windows):

    push <reg>
    .seh_pushreg <reg>

whereas on all other platforms it would just emit the plain `push
<reg>` instruction.  This part I believe is easy.
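
A minimal sketch of what such a macro might look like in GMP's m4
dialect (HOST_WIN64 here is a hypothetical symbol, standing in for
whatever the configure machinery would actually define for 64-bit
Windows targets):

    define(`push_reg',
    `push	$1
    ifdef(`HOST_WIN64',`	.seh_pushreg	$1')')

The prologues could then say push_reg(%r13) and so on, getting the
correct annotation automatically on Windows while remaining unchanged
everywhere else.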

The trickiest part is just ensuring that some register is available to
establish a frame pointer, when necessary (it doesn't necessarily have
to be RBP; any nonvolatile register will do).  In the case of
mpn_divrem_1 I can see that %r14 is available, but I will have to go
through all the routines one by one and work this out.

So, should I work on this?

Thanks,
Madison


[1] https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=vs-2019
[2] https://gmplib.org/list-archives/gmp-bugs/2008-March/000951.html
[3] https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=vs-2019#masm-macros
-------------- next part --------------
#include <gmp.h>
#include <stdlib.h>


int main(void) {
    mp_limb_t *rlp;
    mp_size_t qxn = 0;
    mp_limb_t s2p[1] = {0};
    mp_size_t s2n = 1;
    mp_limb_t s3limb = 1;

    /* Use of this function in particular is arbitrary aside from
     * the fact that it is known to demonstrate the problem on my
     * system (my system uses an assembly implementation for it)
     */

    /* Just set to something that will segfault when accessed */
    rlp = (mp_limb_t*)0x1234;
    mpn_divrem_1(rlp, qxn, s2p, s2n, s3limb);
    return 1;
}
-------------- next part --------------
	.text
	.align	16, 0x90
	.globl	__gmpn_preinv_divrem_1_x86_64
	
        .seh_proc __gmpn_preinv_divrem_1_x86_64
__gmpn_preinv_divrem_1_x86_64:

	push	%rdi
        .seh_pushreg    %rdi
	push	%rsi
        .seh_pushreg    %rsi
	mov	%rcx, %rdi
	mov	%rdx, %rsi
	mov	%r8, %rdx
	mov	%r9, %rcx

	mov	56(%rsp), %r8	
	mov	64(%rsp), %r9	
	xor	%eax, %eax
	push	%r13
        .seh_pushreg    %r13
	push	%r12
        .seh_pushreg    %r12
	push	%rbp
        .seh_pushreg    %rbp
	push	%rbx
        .seh_pushreg    %rbx
	.seh_endprologue

	mov	%rsi, %r12
	mov	%rcx, %rbx
	add	%rsi, %rcx
	mov	%rdx, %rsi

	lea	-8(%rdi,%rcx,8), %rdi

	test	%r8, %r8
	js	Lnent

	mov	104(%rsp), %cl
	shl	%cl, %r8
	jmp	Luent
.seh_endproc
	

	.align	16, 0x90
	.globl	__gmpn_divrem_1_x86_64
	
        .seh_proc __gmpn_divrem_1_x86_64
__gmpn_divrem_1_x86_64:

	push	%rdi
        .seh_pushreg    %rdi
	push	%rsi
        .seh_pushreg    %rsi
	mov	%rcx, %rdi
	mov	%rdx, %rsi
	mov	%r8, %rdx
	mov	%r9, %rcx

	mov	56(%rsp), %r8	
	xor	%eax, %eax
	push	%r13
        .seh_pushreg    %r13
	push	%r12
        .seh_pushreg    %r12
	push	%rbp
        .seh_pushreg    %rbp
	push	%rbx
        .seh_pushreg    %rbx
	.seh_endprologue

	mov	%rsi, %r12
	mov	%rcx, %rbx
	add	%rsi, %rcx
	mov	%rdx, %rsi
	je	Lret

	lea	-8(%rdi,%rcx,8), %rdi
	xor	%ebp, %ebp

	test	%r8, %r8
	jns	Lunnormalized

Lnormalized:
	test	%rbx, %rbx
	je	L8			
	mov	-8(%rsi,%rbx,8), %rbp
	dec	%rbx
	mov	%rbp, %rax
	sub	%r8, %rbp
	cmovc	%rax, %rbp
	sbb	%eax, %eax
	inc	%eax
	mov	%rax, (%rdi)
	lea	-8(%rdi), %rdi
L8:


	push	%r8

	sub	$32, %rsp	
	mov	%r8, %rcx		
	
	call	__gmpn_invert_limb
	add	$32, %rsp	
	pop	%r8



	mov	%rax, %r9
	mov	%rbp, %rax
	jmp	Lnent

	.align	16, 0x90
Lntop:mov	(%rsi,%rbx,8), %r10		
	mul	%r9			
	add	%r10, %rax		
	adc	%rbp, %rdx		
	mov	%rax, %rbp		
	mov	%rdx, %r13		
	imul	%r8, %rdx			
	sub	%rdx, %r10		
	mov	%r8, %rax			
	add	%r10, %rax		
	cmp	%rbp, %r10		
	cmovc	%r10, %rax		
	adc	$-1, %r13		
	cmp	%r8, %rax			
	jae	Lnfx			
Lnok:	mov	%r13, (%rdi)		
	sub	$8, %rdi			
Lnent:lea	1(%rax), %rbp		
	dec	%rbx			
	jns	Lntop			

	xor	%ecx, %ecx
	jmp	Lfrac

Lnfx:	sub	%r8, %rax
	inc	%r13
	jmp	Lnok

Lunnormalized:
	test	%rbx, %rbx
	je	L44
	mov	-8(%rsi,%rbx,8), %rax
	cmp	%r8, %rax
	jae	L44
	mov	%rbp, (%rdi)
	mov	%rax, %rbp
	lea	-8(%rdi), %rdi
	je	Lret
	dec	%rbx
L44:
	bsr	%r8, %rcx
	not	%ecx
	shl	%cl, %r8
	shl	%cl, %rbp

	push	%rcx


	push	%r8


	sub	$40, %rsp	
	mov	%r8, %rcx		
	
	call	__gmpn_invert_limb

	add	$40, %rsp	
	pop	%r8


	pop	%rcx

	mov	%rax, %r9
	mov	%rbp, %rax
	test	%rbx, %rbx
	je	Lfrac

Luent:dec	%rbx
	mov	(%rsi,%rbx,8), %rbp
	neg	%ecx
	shr	%cl, %rbp
	neg	%ecx
	or	%rbp, %rax
	jmp	Lent

	.align	16, 0x90
Lutop:mov	(%rsi,%rbx,8), %r10
	shl	%cl, %rbp
	neg	%ecx
	shr	%cl, %r10
	neg	%ecx
	or	%r10, %rbp
	mul	%r9
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	%r8, %rdx
	sub	%rdx, %rbp
	mov	%r8, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	%r8, %rax
	jae	Lufx
Luok:	mov	%r13, (%rdi)
	sub	$8, %rdi
Lent:	mov	(%rsi,%rbx,8), %rbp
	dec	%rbx
	lea	1(%rax), %r11
	jns	Lutop

Luend:shl	%cl, %rbp
	mul	%r9
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	%r8, %rdx
	sub	%rdx, %rbp
	mov	%r8, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	%r8, %rax
	jae	Lefx
Leok:	mov	%r13, (%rdi)
	sub	$8, %rdi
	jmp	Lfrac

Lufx:	sub	%r8, %rax
	inc	%r13
	jmp	Luok
Lefx:	sub	%r8, %rax
	inc	%r13
	jmp	Leok

Lfrac:mov	%r8, %rbp
	neg	%rbp
	jmp	Lfent

	.align	16, 0x90			
Lftop:mul	%r9			
	add	%r11, %rdx		
	mov	%rax, %r11		
	mov	%rdx, %r13		
	imul	%rbp, %rdx		
	mov	%r8, %rax			
	add	%rdx, %rax		
	cmp	%r11, %rdx		
	cmovc	%rdx, %rax		
	adc	$-1, %r13		
	mov	%r13, (%rdi)		
	sub	$8, %rdi			
Lfent:lea	1(%rax), %r11		
	dec	%r12			
	jns	Lftop			

	shr	%cl, %rax
Lret:	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%rsi
	pop	%rdi
	ret
.seh_endproc