Improvements to powerpc32 asm code
Mark Rodenkirch
mrodenkirch@wi.rr.com
Sun, 1 Jun 2003 06:28:42 -0500
--Apple-Mail-9-997795106
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
On Sunday, June 1, 2003, at 05:00 AM, Torbjorn Granlund wrote:
> For which powerpc model did you get these timing results?
> My code performs very similarly on G3 and the old G4, with
> a slight advantage for your code for larger operands.
My tests were done on a 7400. I don't have any other CPUs to test it
on. I have attached my version of the code (below) if you are
interested in comparing it to the new version. It is quite possible
that one version works better on the G4, while the other works better
on the 604e.
> C cycles/limb
> C 603e: ?
> C 604e: 3.25
> C 75x (G3): 3.5
> C 7400,7410 (G4): 3.5
> C 744x,745x (G4+): 4.25
>
> To test the changes, I am testing adds and subtracts on values
> from 1 to 30 limbs for base 2 and base 10 numbers. If there is
> a better means to testing, I would like to know.
>
> The best program to use is probably gmp/tests/devel/try.c.
Thanks. I'll do that.
BTW, I am also looking at improving addmul_1.asm. On my G4 I have it
at a little over 7 cycles/limb (modeled after the powerpc64 routine),
which is better than the current 8.5 cycles/limb. It contains a bug,
so when I work that bug out, I might lose the added efficiency. If
anyone has already done that or has done it better, then I will stop
working on it.
--Mark
--Apple-Mail-9-997795106
Content-Disposition: attachment;
filename=add_n_new.asm
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name="add_n_new.asm"
dnl PowerPC 750 mpn_add_n -- add mpn limb vectors.
dnl Copyright 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')
C cycles/limb
C 604e: 4.0
C 750: 4.0
C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C
C The use of offsets xp-wp and yp-wp is necessary for 4.0 c/l on 750.
ASM_START()
PROLOGUE(mpn_add_n)
C Add the size-limb vectors at xp and yp, store the size-limb sum at wp and
C return the carry out of the most significant limb (0 or 1) in r3.
C
C Register usage on entry:
C r3	wp	destination
C r4	xp	first source
C r5	yp	second source
C r6	size	limb count, the code below assumes size >= 1
C
C Scratch registers: r0, r6-r10, r12, ctr, cr0 and the CA bit.
C r2 is intentionally left alone.  It is the TOC pointer under AIX/PowerOpen
C and reserved for the system under the 32-bit SysV ABI, so it must not be
C used as a temporary in code that is shared between those ABIs.

	cmpi	cr0, r6, 0x3
	bgt	cr0, L(start)		C size > 3: take the unrolled loop

C size is 1, 2 or 3: straight-line code, one limb at a time.
	lwz	r8, 0(r4)
	lwz	r9, 0(r5)
	addc	r8, r8, r9		C limb 0 starts the carry chain
	stw	r8, 0(r3)
	cmpi	cr0, r6, 0x1
	beq	cr0, L(done)		C size = 1
	lwz	r8, 4(r4)
	lwz	r9, 4(r5)
	adde	r8, r8, r9
	stw	r8, 4(r3)
	cmpi	cr0, r6, 0x2
	beq	cr0, L(done)		C size = 2
	lwz	r8, 8(r4)
	lwz	r9, 8(r5)
	adde	r8, r8, r9
	stw	r8, 8(r3)
	b	L(done)

L(start):
	andi.	r12, r6, 0x3		C r12 = size mod 4, limbs left over for the tail
	sub	r4, r4, r3		C r4 = xp - wp
	sub	r5, r5, r3		C r5 = yp - wp
	subi	r3, r3, 4		C r3 = wp - 4, pre-biased for stwu
	srwi	r6, r6, 0x2		C r6 = size / 4
	mtctr	r6			C ctr = number of 4-limb passes
	addi	r4, r4, 4		C r4 + r3 now addresses xp[i]
	addi	r5, r5, 4		C r5 + r3 now addresses yp[i]
	addi	r10, r4, 4		C r10 + r3 addresses xp[i+1]
	addic	r0, r0, 0		C clear CA explicitly; r0 + 0 never carries

L(loop):
C At the top of each pass r3 = address of wp[i] minus 4 and CA holds the
C carry into limb i.  None of the loads, stores or bdnz touch CA.
C r4	xp - wp + 4
C r5	yp - wp + 4
C r10	xp - wp + 8
C r8	xp[] limb, then the sum
C r9	yp[] limb in the first step, xp[] limb loaded one ahead afterwards
	lwzx	r8, r4, r3		C r8 = xp[i]
	lwzx	r9, r5, r3		C r9 = yp[i]
	adde	r8, r8, r9		C r8 = xp[i] + yp[i] + CA
	lwzx	r9, r10, r3		C r9 = xp[i+1], fetched before r3 advances
	stwu	r8, 4(r3)		C wp[i] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+1]
	adde	r8, r8, r9		C r8 = xp[i+1] + yp[i+1] + CA
	lwzx	r9, r10, r3		C r9 = xp[i+2]
	stwu	r8, 4(r3)		C wp[i+1] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+2]
	adde	r8, r8, r9		C r8 = xp[i+2] + yp[i+2] + CA
	lwzx	r9, r10, r3		C r9 = xp[i+3]
	stwu	r8, 4(r3)		C wp[i+2] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+3]
	adde	r8, r8, r9		C r8 = xp[i+3] + yp[i+3] + CA
	stwu	r8, 4(r3)		C wp[i+3] = r8, r3 += 4
	bdnz	L(loop)

L(last):
	cmpi	cr0, r12, 0x0		C cmpi leaves CA untouched
	beq	cr0, L(done)		C size was a multiple of 4
	mtctr	r12			C 1 to 3 limbs remain

L(rest):
	lwzx	r6, r4, r3		C r6 = xp[i]
	lwzx	r7, r5, r3		C r7 = yp[i]
	adde	r6, r6, r7
	stwu	r6, 4(r3)		C wp[i] = r6, r3 += 4
	bdnz	L(rest)

L(done):
	li	r3, 0
	addze	r3, r3			C return value = CA, the carry out
	blr
EPILOGUE()
--Apple-Mail-9-997795106
Content-Disposition: attachment;
filename=sub_n_new.asm
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name="sub_n_new.asm"
dnl PowerPC 750 mpn_sub_n -- subtract limb vectors.
dnl Copyright 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')
C cycles/limb
C 604e: 4.0
C 750: 4.0
C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
C
C Same style as mpn_add_n.
ASM_START()
PROLOGUE(mpn_sub_n)
C Subtract the size-limb vector at yp from the one at xp, store the size-limb
C difference at wp and return the borrow out of the most significant limb
C (0 or 1) in r3.
C
C Register usage on entry:
C r3	wp	destination
C r4	xp	minuend
C r5	yp	subtrahend
C r6	size	limb count, the code below assumes size >= 1
C
C Scratch registers: r6, r8-r10, r12, ctr, cr0 and the CA bit.  r0 is read
C once at the end but its value does not influence the result.
C r2 is intentionally left alone.  It is the TOC pointer under AIX/PowerOpen
C and reserved for the system under the 32-bit SysV ABI, so it must not be
C used as a temporary in code that is shared between those ABIs.
C
C Throughout, CA = 1 means no borrow and CA = 0 means borrow.

	cmpi	cr0, r6, 0x3
	bgt	cr0, L(start)		C size > 3: take the unrolled loop

C size is 1, 2 or 3: straight-line code, one limb at a time.
	lwz	r8, 0(r4)
	lwz	r9, 0(r5)
	subfc	r8, r9, r8		C r8 = xp[0] - yp[0], starts the borrow chain
	stw	r8, 0(r3)
	cmpi	cr0, r6, 0x1
	beq	cr0, L(done)		C size = 1
	lwz	r8, 4(r4)
	lwz	r9, 4(r5)
	subfe	r8, r9, r8		C r8 = xp[1] - yp[1] - borrow
	stw	r8, 4(r3)
	cmpi	cr0, r6, 0x2
	beq	cr0, L(done)		C size = 2
	lwz	r8, 8(r4)
	lwz	r9, 8(r5)
	subfe	r8, r9, r8		C r8 = xp[2] - yp[2] - borrow
	stw	r8, 8(r3)
	b	L(done)

L(start):
	andi.	r12, r6, 0x3		C r12 = size mod 4, limbs left over for the tail
	sub	r4, r4, r3		C r4 = xp - wp
	sub	r5, r5, r3		C r5 = yp - wp
	subi	r3, r3, 4		C r3 = wp - 4, pre-biased for stwu
	srwi	r6, r6, 0x2		C r6 = size / 4
	mtctr	r6			C ctr = number of 4-limb passes
	addi	r4, r4, 4		C r4 + r3 now addresses xp[i]
	addi	r5, r5, 4		C r5 + r3 now addresses yp[i]
	addi	r10, r4, 4		C r10 + r3 addresses xp[i+1]
	lwzx	r8, r4, r3		C r8 = xp[0]
	lwzx	r9, r5, r3		C r9 = yp[0]
	subfc	r8, r9, r8		C r8 = xp[0] - yp[0], sets CA without reading it
	b	L(inner)		C join the first pass after its first subtract

L(loop):
C At the top of each pass r3 = address of wp[i] minus 4 and CA holds the
C inverted borrow into limb i.  None of the loads, stores or bdnz touch CA.
C r4	xp - wp + 4
C r5	yp - wp + 4
C r10	xp - wp + 8
C r8	xp[i] here, yp[] limbs and the differences after L(inner)
C r9	yp[i] here, xp[] limbs loaded one ahead after L(inner)
	lwzx	r8, r4, r3		C r8 = xp[i]
	lwzx	r9, r5, r3		C r9 = yp[i]
	subfe	r8, r9, r8		C r8 = xp[i] - yp[i] - borrow

L(inner):
	lwzx	r9, r10, r3		C r9 = xp[i+1], fetched before r3 advances
	stwu	r8, 4(r3)		C wp[i] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+1]
	subfe	r8, r8, r9		C r8 = xp[i+1] - yp[i+1] - borrow
	lwzx	r9, r10, r3		C r9 = xp[i+2]
	stwu	r8, 4(r3)		C wp[i+1] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+2]
	subfe	r8, r8, r9		C r8 = xp[i+2] - yp[i+2] - borrow
	lwzx	r9, r10, r3		C r9 = xp[i+3]
	stwu	r8, 4(r3)		C wp[i+2] = r8, r3 += 4
	lwzx	r8, r5, r3		C r8 = yp[i+3]
	subfe	r8, r8, r9		C r8 = xp[i+3] - yp[i+3] - borrow
	stwu	r8, 4(r3)		C wp[i+3] = r8, r3 += 4
	bdnz	L(loop)

L(last):
	cmpi	cr0, r12, 0x0		C cmpi leaves CA untouched
	beq	cr0, L(done)		C size was a multiple of 4
	mtctr	r12			C 1 to 3 limbs remain

L(rest):
	lwzx	r8, r4, r3		C r8 = xp[i]
	lwzx	r9, r5, r3		C r9 = yp[i]
	subfe	r8, r9, r8		C r8 = xp[i] - yp[i] - borrow
	stwu	r8, 4(r3)		C wp[i] = r8, r3 += 4
	bdnz	L(rest)

L(done):
	subfe	r3, r0, r0		C r3 = CA - 1: 0 if no borrow, -1 if borrow
	subfic	r3, r3, 0		C r3 = -r3: 0 or 1
	blr
EPILOGUE()
--Apple-Mail-9-997795106--