Athlon XP Single limb multiply

Thu Dec 18 13:04:27 CET 2003

This code is in MASM syntax and always runs at ~3 cycles per limb.  The only
improvement is to reduce the code size and unroll to process 32 limbs at
once.  Here is the loop code that gets unrolled:

               ; One limb step of the unrolled loop.  `i` is the assembly-time
               ; byte offset (relative to ESI) of the limb that follows the
               ; one being multiplied in this step.
               mul     ebp                     ; edx:eax = eax (current limb) * ebp

               add     ebx, eax                ; ebx = carry in + low product; sets CF

               mov     eax, [esi][i]           ; fetch the next limb (MOV leaves CF alone)

               mov     [esi][i][-4], ebx       ; store the result over the current limb

               mov     ebx, ecx                ; the full routine below zeroes ecx first

               adc     ebx, edx                ; ebx = high product + CF = carry out

The following is the whole routine I use:

        OPTION PROLOGUE:NONE            ; registers are saved by hand below
        OPTION EPILOGUE:NONE            ; ... and restored by hand before RETN

;-----------------------------------------------------------------------
; mpn_mul_1__k7_04 -- multiply a multi-limb number, in place, by one
;                     32-bit limb (Athlon XP / K7 tuned, MASM syntax).
;
; Stack arguments (the routine pops all 12 bytes itself: RETN 4*3):
;   bInt   pointer to limb 0 of the number.  The limb count is read from
;          [bInt].BIGINT.limbs - SIZEOF BIGINT, i.e. presumably from a
;          BIGINT header stored immediately before the limb data.
;   limb   the 32-bit multiplier.
;   carry  value added into the lowest product (carry in).
; Returns: eax = carry out of the highest limb.
; Saves/restores ebp, ebx, esi, edi; clobbers ecx, edx and the flags.
;
; Register roles inside the loop:
;   eax = current limb / low product     edx = high product
;   ebx = carry being propagated         ecx = 0 (constant)
;   ebp = multiplier                     esi = block pointer
;   edi = number of full UNROLL-limb blocks still to do
;
; NOTE(review): every step loads the limb that follows the one it
; multiplies, so the last step reads one DWORD past the final limb (the
; value is never used), and limb 0 is read even when the count is 0.
; Confirm the buffer is always readable at those addresses.
;-----------------------------------------------------------------------

        ALIGN _CODE_ALIGNMENT_

; UNROLL may range from 2 thru 62 (every displacement must fit in a byte).
; Measured with UNROLL = 32: 3.0700 cycles per limb (DWORD)

mpn_mul_1__k7_04_UNROLL = 32

mpn_mul_1__k7_04 PROC USES ebx esi edi, bInt:PTR BIGINT, limb:DWORD, carry:DWORD

        ; No frame is built, so the arguments are addressed off ESP:
        ; 4 saved registers + the return address sit below the first one.
        pBIGINT EQU <[esp+4*(4+1)]>
        _limb   EQU <[esp+4*(4+2)]>
        _carry  EQU <[esp+4*(4+3)]>

        push    ebp
        push    ebx
        push    esi
        push    edi

        mov     esi, pBIGINT

        ; edi = limbs / UNROLL (full blocks), edx = limbs MOD UNROLL
IF 32 EQ mpn_mul_1__k7_04_UNROLL
        mov     edi, [esi].BIGINT.limbs - SIZEOF BIGINT
        mov     edx, edi
        shr     edi, 5         ; divide by 32
        and     edx, 1Fh       ; remainder
ELSE
        mov     ecx, mpn_mul_1__k7_04_UNROLL
        mov     eax, [esi].BIGINT.limbs - SIZEOF BIGINT
        xor     edx, edx       ; DIV is unsigned: zero-extend, don't sign-extend
        div     ecx
        mov     edi, eax
ENDIF

        ; Every unrolled step assembles to exactly 14 bytes, so the entry
        ; point for the partial block is _2 - 14*(limbs MOD UNROLL).
        imul    ecx, edx, -14          ; mpn_mul_1__k7_04_LIMB_CODE
        mov     eax, [esi]             ; first limb to multiply
        mov     ebp, _limb             ; multiplier
        mov     ebx, _carry            ; carry

        ; Adjust ESI to point at the correct block of data to allow a
        ; greater unroll range while still having only byte offsets:
        ;       each block can be a maximum of 128 bytes
        ;
        ; Step k of a block stores its result at [esi + i - 4], with
        ; i = 80h - 4*UNROLL + 4*k.  The partial pass executes only the
        ; last (limbs MOD UNROLL) steps and its first store must land on
        ; limb 0, hence:
        ;
        ;       ESI = ESI + 4*((limbs MOD UNROLL) - 31)     ; 31 = 80h/4 - 1
        sub     edx, 31
        shl     edx, 2
        add     esi, edx

        ; jump to complete partial roll first
        lea     edx, [ecx][_2]
        xor     ecx, ecx               ; ecx stays 0: source for "mov ebx, ecx"
        jmp     edx

        ALIGN 16
_0:
        i=80h - 4*mpn_mul_1__k7_04_UNROLL
        WHILE i LT 80h
               mul     ebp             ; edx:eax = limb * multiplier
               add     ebx, eax        ; low product + carry in, sets CF
               ; A zero displacement is hand-encoded with an explicit
               ; disp8 of 0 so that every step keeps the same 14-byte size.
               IF i EQ 0
                       BYTE 8Bh, 46h, 00h      ; mov eax, [esi+0] (disp8 form)
                       mov     [esi][i][-4], ebx
               ELSEIF i EQ 4
                       mov     eax, [esi][i]
                       BYTE 89h, 5Eh, 00h      ; mov [esi+0], ebx (disp8 form)
               ELSEIF i GT -80h
                       mov     eax, [esi][i]
                       mov     [esi][i][-4], ebx
               ELSE ; only byte offsets
                       .err mpn_mul_1__k7_04_UNROLL too big!
               ENDIF
               mov     ebx, ecx        ; ebx = 0 (MOV leaves CF untouched)
               adc     ebx, edx        ; carry out = high product + CF
        i=i+4
        ENDM

_2:     dec     edi                    ; one block (partial or full) finished
        lea     esi, [esi + 4*mpn_mul_1__k7_04_UNROLL] ; LEA keeps DEC's flags
        jns     _0                     ; loop while full blocks remain

        mov     eax, ebx ; return carry

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        retn    4*3                    ; callee pops the three DWORD arguments

mpn_mul_1__k7_04 ENDP

* mailto:bowersrn at osd.pentagon.mil <mailto:bowersrn at osd.pentagon.mil> 

* 831.583.2500 Ex.5339

-------------- next part --------------
An HTML attachment was scrubbed...
URL: /list-archives/gmp-devel/attachments/20031218/c4d9b492/attachment-0001.htm