Improvements to powerpc32 asm code

Mark Rodenkirch mrodenkirch@wi.rr.com
Sun, 1 Jun 2003 06:28:42 -0500


--Apple-Mail-9-997795106
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed


On Sunday, June 1, 2003, at 05:00 AM, Torbjorn Granlund wrote:

> For which powerpc model did you get these timing results?
> My code performs very similarly on G3 and the old G4, with
> a slight advantage for your code for larger operands.

My tests were done on a 7400.  I don't have any other CPUs to test it 
on.  I have attached my version of the code (below) if you are 
interested in comparing it to the new version.  It is quite possible 
that one version works better on the G4, while the other works better 
on the 604e.

> C                cycles/limb
> C 603e:            ?
> C 604e:            3.25
> C 75x (G3):        3.5
> C 7400,7410 (G4):  3.5
> C 744x,745x (G4+): 4.25
>
>   To test the changes, I am testing adds and subtracts on values
>   from 1 to 30 limbs for base 2 and base 10 numbers.  If there is
>   a better means to testing, I would like to know.
>
> The best program to use is probably gmp/tests/devel/try.c.

Thanks.  I'll do that.

BTW, I am also looking at improving addmul_1.asm.  On my G4 I have it 
at a little over 7 cycles/limb (modeled after the powerpc64 routine), 
which is better than the current 8.5 cycles/limb.  It contains a bug, 
so when I work that bug out, I might lose the added efficiency.  If 
anyone has already done that or has done it better, then I will stop 
working on it.

--Mark


--Apple-Mail-9-997795106
Content-Disposition: attachment;
	filename=add_n_new.asm
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name="add_n_new.asm"

dnl  PowerPC 750 mpn_add_n -- add mpn limb vectors.

dnl  Copyright 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


C       cycles/limb
C 604e:     4.0
C 750:      4.0

C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                      mp_size_t size);
C
C The use of offsets xp-wp and yp-wp is necessary for 4.0 c/l on 750.

ASM_START()
PROLOGUE(mpn_add_n)

	C Computes wp[0..size-1] = xp[0..size-1] + yp[0..size-1] and returns
	C the carry out of the most significant limb (0 or 1) in r3.
	C
	C Register usage:
	C r3	wp on entry; running store pointer; carry out on return
	C r4	xp on entry; later xp-wp+4
	C r5	yp on entry; later yp-wp+4
	C r6	size in limbs; later the unrolled iteration count
	C r8,r9	limb scratch
	C r10	xp-wp+8, used to fetch xp one limb ahead of the store
	C r12	size mod 4, limbs left over after the unrolled loop
	C
	C Only volatile registers r0, r3-r6, r8-r10 and r12 are touched.  r2 is
	C deliberately avoided: it is scratch on Darwin only, and is reserved
	C (SysV) or holds the TOC pointer (AIX) elsewhere.
	C
	C Sizes 1 to 3 are handled by straight-line code; larger sizes use a
	C 4-way unrolled loop followed by a 1-limb loop for the remainder.
	C None of the compares, loads, stores or counter operations below
	C modify the carry bit, so it survives from one add to the next.

	cmpi	cr0, r6, 0x3
	bgt	cr0, L(start)	C branch if size > 3

	lwz	r8, 0(r4)
	lwz	r9, 0(r5)
	addc	r8, r8, r9	C wp[0] = xp[0] + yp[0], sets carry
	stw	r8, 0(r3)

	cmpi	cr0, r6, 0x1
	beq	cr0, L(done)	C branch if size = 1

	lwz	r8, 4(r4)
	lwz	r9, 4(r5)
	adde	r8, r8, r9	C wp[1] = xp[1] + yp[1] + carry
	stw	r8, 4(r3)

	cmpi	cr0, r6, 0x2
	beq	cr0, L(done)	C branch if size = 2

	lwz	r8, 8(r4)
	lwz	r9, 8(r5)
	adde	r8, r8, r9	C wp[2] = xp[2] + yp[2] + carry
	stw	r8, 8(r3)
	b	L(done)

L(start):
	andi.	r12, r6, 0x3	C r12 = size mod 4

	sub	r4, r4, r3	C r4 = xp - wp
	sub	r5, r5, r3	C r5 = yp - wp

	subi	r3, r3, 4	C r3 = wp - 4, since stwu pre-increments

	srwi	r6, r6, 0x2	C r6 = size / 4, at least 1 here
	mtctr	r6

	addi	r4, r4, 4	C r4 + r3 = address of xp[i]
	addi	r5, r5, 4	C r5 + r3 = address of yp[i]
	addi	r10, r4, 4	C r10 + r3 = address of xp[i+1]

	addic	r0, r0, 0	C clear carry for the first adde; r0 is
				C unchanged and adding 0 never carries

L(loop):
	C r3	address of wp[i-1], advanced by each stwu
	C r4	xp-wp+4
	C r5	yp-wp+4
	C r10	xp-wp+8
	C r8,r9	limbs being added
	C
	C Each xp limb after the first is loaded before the preceding store
	C advances r3, which is why r10 carries an extra 4 byte offset.

	lwzx	r8, r4, r3	C r8 = xp[i]
	lwzx	r9, r5, r3	C r9 = yp[i]
	adde	r8, r8, r9	C r8 = xp[i] + yp[i] + carry

	lwzx	r9, r10, r3	C r9 = xp[i+1]
	stwu	r8, 4(r3)	C store wp[i], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+1]

	adde	r8, r8, r9	C r8 = xp[i+1] + yp[i+1] + carry

	lwzx	r9, r10, r3	C r9 = xp[i+2]
	stwu	r8, 4(r3)	C store wp[i+1], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+2]

	adde	r8, r8, r9	C r8 = xp[i+2] + yp[i+2] + carry

	lwzx	r9, r10, r3	C r9 = xp[i+3]
	stwu	r8, 4(r3)	C store wp[i+2], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+3]

	adde	r8, r8, r9	C r8 = xp[i+3] + yp[i+3] + carry

	stwu	r8, 4(r3)	C store wp[i+3], r3 += 4
	bdnz	L(loop)

L(last):
	cmpi	cr0, r12, 0x0
	beq	cr0, L(done)	C branch if no more limbs

	mtctr	r12		C 1 to 3 limbs remain

L(rest):
	lwzx	r8, r4, r3	C r8 = xp[i]
	lwzx	r9, r5, r3	C r9 = yp[i]

	adde	r8, r8, r9	C r8 = xp[i] + yp[i] + carry
	stwu	r8, 4(r3)	C store wp[i], r3 += 4
	bdnz	L(rest)

L(done):
	li	r3, 0

	addze	r3, r3		C return value = carry out, 0 or 1
	blr

EPILOGUE()

--Apple-Mail-9-997795106
Content-Disposition: attachment;
	filename=sub_n_new.asm
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name="sub_n_new.asm"

dnl  PowerPC 750 mpn_sub_n -- subtract limb vectors.

dnl  Copyright 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


C       cycles/limb
C 604e:     4.0
C 750:      4.0

C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                      mp_size_t size);
C
C Same style as mpn_add_n.

ASM_START()
PROLOGUE(mpn_sub_n)

	C Computes wp[0..size-1] = xp[0..size-1] - yp[0..size-1] and returns
	C the borrow out of the most significant limb (0 or 1) in r3.
	C
	C Register usage:
	C r3	wp on entry; running store pointer; borrow out on return
	C r4	xp on entry; later xp-wp+4
	C r5	yp on entry; later yp-wp+4
	C r6	size in limbs; later the unrolled iteration count
	C r8,r9	limb scratch
	C r10	xp-wp+8, used to fetch xp one limb ahead of the store
	C r12	size mod 4, limbs left over after the unrolled loop
	C
	C Only volatile registers r0, r3-r6, r8-r10 and r12 are touched.  r2 is
	C deliberately avoided: it is scratch on Darwin only, and is reserved
	C (SysV) or holds the TOC pointer (AIX) elsewhere.
	C
	C subfc/subfe rD,rA,rB compute rB - rA, with the carry bit holding the
	C inverted borrow (carry = 1 means no borrow).  None of the compares,
	C loads, stores or counter operations below modify the carry bit.

	cmpi	cr0, r6, 0x3
	bgt	cr0, L(start)	C branch if size > 3

	lwz	r8, 0(r4)
	lwz	r9, 0(r5)
	subfc	r8, r9, r8	C wp[0] = xp[0] - yp[0], sets carry
	stw	r8, 0(r3)

	cmpi	cr0, r6, 0x1
	beq	cr0, L(done)	C branch if size = 1

	lwz	r8, 4(r4)
	lwz	r9, 4(r5)
	subfe	r8, r9, r8	C wp[1] = xp[1] - yp[1] - borrow
	stw	r8, 4(r3)

	cmpi	cr0, r6, 0x2
	beq	cr0, L(done)	C branch if size = 2

	lwz	r8, 8(r4)
	lwz	r9, 8(r5)
	subfe	r8, r9, r8	C wp[2] = xp[2] - yp[2] - borrow
	stw	r8, 8(r3)
	b	L(done)

L(start):
	andi.	r12, r6, 0x3	C r12 = size mod 4

	sub	r4, r4, r3	C r4 = xp - wp
	sub	r5, r5, r3	C r5 = yp - wp

	subi	r3, r3, 4	C r3 = wp - 4, since stwu pre-increments

	srwi	r6, r6, 0x2	C r6 = size / 4, at least 1 here
	mtctr	r6

	addi	r4, r4, 4	C r4 + r3 = address of xp[i]
	addi	r5, r5, 4	C r5 + r3 = address of yp[i]
	addi	r10, r4, 4	C r10 + r3 = address of xp[i+1]

	C The first limb uses subfc, which needs no incoming carry, and then
	C joins the unrolled loop after its leading subfe.

	lwzx	r8, r4, r3	C r8 = xp[0]
	lwzx	r9, r5, r3	C r9 = yp[0]
	subfc	r8, r9, r8	C r8 = xp[0] - yp[0], sets carry
	b	L(inner)

L(loop):
	C r3	address of wp[i-1], advanced by each stwu
	C r4	xp-wp+4
	C r5	yp-wp+4
	C r10	xp-wp+8
	C r8,r9	limbs being subtracted

	lwzx	r8, r4, r3	C r8 = xp[i]
	lwzx	r9, r5, r3	C r9 = yp[i]
	subfe	r8, r9, r8	C r8 = xp[i] - yp[i] - borrow

L(inner):
	C From here on r9 holds the xp limb and r8 the yp limb, so the
	C subfe operand order is swapped relative to the one above.  Each
	C xp limb is loaded before the preceding store advances r3, which
	C is why r10 carries an extra 4 byte offset.

	lwzx	r9, r10, r3	C r9 = xp[i+1]
	stwu	r8, 4(r3)	C store wp[i], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+1]

	subfe	r8, r8, r9	C r8 = xp[i+1] - yp[i+1] - borrow

	lwzx	r9, r10, r3	C r9 = xp[i+2]
	stwu	r8, 4(r3)	C store wp[i+1], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+2]

	subfe	r8, r8, r9	C r8 = xp[i+2] - yp[i+2] - borrow

	lwzx	r9, r10, r3	C r9 = xp[i+3]
	stwu	r8, 4(r3)	C store wp[i+2], r3 += 4
	lwzx	r8, r5, r3	C r8 = yp[i+3]

	subfe	r8, r8, r9	C r8 = xp[i+3] - yp[i+3] - borrow

	stwu	r8, 4(r3)	C store wp[i+3], r3 += 4
	bdnz	L(loop)

L(last):
	cmpi	cr0, r12, 0x0
	beq	cr0, L(done)	C branch if no more limbs

	mtctr	r12		C 1 to 3 limbs remain

L(rest):
	lwzx	r8, r4, r3	C r8 = xp[i]
	lwzx	r9, r5, r3	C r9 = yp[i]

	subfe	r8, r9, r8	C r8 = xp[i] - yp[i] - borrow
	stwu	r8, 4(r3)	C store wp[i], r3 += 4
	bdnz	L(rest)

L(done):
	subfe	r3, r0, r0	C r3 = carry - 1: 0 if no borrow, else -1

	subfic	r3, r3, 0	C r3 = -r3: return borrow out, 0 or 1
	blr

EPILOGUE()

--Apple-Mail-9-997795106--