Improvements to powerpc32 asm code
Torbjorn Granlund
tege@swox.com
01 Jun 2003 13:55:28 +0200
Mark Rodenkirch <mrodenkirch@wi.rr.com> writes:
> My tests were done on a 7400.  I don't have any other CPUs to
> test it on.  I have attached my version of the code (below) if
> you are interested in comparing it to the new version.  It is
> quite possible that one version works better on the G4, while
> the other works better on the 604e.
Indeed.  I've tried to make my code run well on all ppc models.
Since the 7400/7450 are now the most important powerpcs, we should
not sacrifice performance for them merely to avoid specialized
code.
Unfortunately, the 7400 (aka G4) is less important than the 7450
(aka G4 too). And their pipelines are quite different. The
former has a 2 issue pipeline, while the latter can issue up to 3
instructions per cycle. Latencies for 7450 are longer than for
7400.
All-in-all, the 7400 is much more similar to 750 (aka G3) than
to 7450.
> BTW, I am also looking at improving addmul_1.asm.  On my G4 I
> have it at a little over 7 cycles/limb (modeled after the
> powerpc64 routine), which is better than the current 8.5
> cycles/limb.  It contains a bug, so when I work that bug out, I
> might lose the added efficiency.  If anyone has already done
> that or has done it better, then I will stop working on it.
I have some preliminary code for mpn_addmul_1 which is a tad bit
faster than the code of GMP 4.1.2. It might not work perfectly
yet. Here it is:
[Attachment: addmul_1.asm]
dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.
dnl Copyright 1995, 1997, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published by
dnl the Free Software Foundation; either version 2.1 of the License, or (at your
dnl option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl MA 02111-1307, USA.
dnl INPUT PARAMETERS
dnl rp r3
dnl up r4
dnl n r5
dnl vl r6
dnl CPU SPEED
dnl 601
dnl 603
dnl 604
dnl 604e 6.25
dnl 630/PWR3
dnl 640/PWR4
dnl 7x0/G3 7.25-12.85
dnl 7400/G4 7.25-12.85
dnl 7450/G4+
dnl While this loop is faster than the checked-in code, it needs more registers
dnl and thus has significant overhead for saving and restoring callee-saves
dnl registers. For the current 604e MUL_KARATSUBA_THRESHOLD, it is
dnl questionable if this code is really useful. Decreasing the number of live
dnl registers is the most important optimization of this code.
include(`../config.m4')
ASM_START()
PROLOGUE(xmpn_addmul_1)
srwi r0,r5,2
mtctr r0
addi r3,r3,-4 C adjust rp, it's offset before it's used
addi r4,r4,-4 C adjust up, it's offset before it's used
li r12,0 C clear upper product reg
addic r0,r0,0 C clear cy
C Start software pipeline
stmw r26,-24(r1) C save registers we are supposed to preserve
C Software pipelined main loop
lwz r8,4(r4)
lwz r9,8(r4)
mullw r30,r8,r6
mulhwu r0,r8,r6
lwz r8,12(r4)
mullw r10,r9,r6
mulhwu r12,r9,r6
lwz r26,4(r3)
lwzu r9,16(r4)
mullw r11,r8,r6
adde r31,r10,r0
mulhwu r0,r8,r6
lwz r27,8(r3)
addze r12,r12
adde r26,r26,r30
adde r27,r27,r31
stw r26,4(r3)
stwu r27,8(r3)
bdz .Lend1
.Loop: lwz r8,4(r4)
mullw r10,r9,r6
adde r28,r11,r12
mulhwu r12,r9,r6
lwz r5,4(r3)
lwz r9,8(r4)
mullw r11,r8,r6
adde r29,r10,r0
mulhwu r0,r8,r6
lwz r7,8(r3)
lwz r8,12(r4)
mullw r10,r9,r6
adde r30,r11,r12
mulhwu r12,r9,r6
lwz r26,12(r3)
lwzu r9,16(r4)
mullw r11,r8,r6
adde r31,r10,r0
mulhwu r0,r8,r6
lwz r27,16(r3)
addze r12,r12
addc r5,r5,r28
adde r7,r7,r29
adde r26,r26,r30
adde r27,r27,r31
stw r5,4(r3)
stw r7,8(r3)
stw r26,12(r3)
stwu r27,16(r3)
bdnz .Loop
C Finish software pipeline
.Lend1:
mullw r10,r9,r6
adde r28,r11,r12
mulhwu r12,r9,r6
adde r29,r10,r0
addze r12,r12
lwz r5,4(r3)
lwz r7,8(r3)
addc r5,r5,r28
adde r7,r7,r29
stw r5,4(r3)
stw r7,8(r3)
addze r3,r12
lmw r26,-24(r1) C restore registers from stack
blr
EPILOGUE(xmpn_addmul_1)
(Note that testing potentially buggy assembly routines can cause
problems with the random number generators of the test programs,
since they rely on mpn_addmul_1.  I usually prepend an x to the
names of new code, e.g. xmpn_addmul_1, and set things up so that
the test programs (try.c, etc) call xmpn_addmul_1 instead of
mpn_addmul_1.  This typically requires passing
-D__gmpn_addmul_1=xmpn_addmul_1 to the compiler.)
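For such comparisons a plain C reference loop is handy.  Here is a
minimal sketch of what xmpn_addmul_1 must compute (my own illustration,
not GMP's generic C code; the name ref_addmul_1 is just for this sketch):

  #include <stdint.h>

  typedef uint32_t mp_limb_t;

  /* Reference semantics of mpn_addmul_1: multiply the n-limb vector at up
     by the single limb vl, add the product into the n-limb vector at rp,
     and return the carry-out limb.  Useful for cross-checking an asm
     candidate such as xmpn_addmul_1.  */
  mp_limb_t
  ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t vl)
  {
    mp_limb_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        uint64_t t = (uint64_t) up[i] * vl + rp[i] + cy;
        rp[i] = (mp_limb_t) t;        /* low 32 bits go back into rp */
        cy = (mp_limb_t) (t >> 32);   /* high 32 bits become the next carry */
      }
    return cy;
  }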
If we want to get the most out of the current 32-bit (and 64-bit)
powerpc chips, we should use floating-point operations, taking
advantage of the fmadd instruction.  Most powerpcs can sustain
about one fmadd per cycle (the 7400 can bizarrely sustain 3 fmadds
every 4 cycles, the 7450 can sustain 4 fmadds every 5 cycles).
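To make the idea concrete, here is a rough C model of one multiply step
(my own sketch, not the attached code; the name fp_mul_step is just for
illustration).  The multiplier is split into 16-bit pieces held as
doubles, so each 32x16-bit product stays below 2^48 and several of them
can be accumulated exactly in a double's 53-bit mantissa:

  #include <stdint.h>

  /* Model of one fmadd-based multiply step: u is a full 32-bit limb,
     v is split into two 16-bit pieces kept as doubles.  Each product is
     < 2^48, so adding a handful of them into a double accumulator loses
     no bits.  */
  static void
  fp_mul_step (uint32_t u, uint32_t v, double acc[2])
  {
    double du  = (double) u;             /* exact: u < 2^32 < 2^53 */
    double v00 = (double) (v & 0xffff);  /* v bits 0..15  */
    double v16 = (double) (v >> 16);     /* v bits 16..31 */

    acc[0] += du * v00;                  /* one fmadd on powerpc */
    acc[1] += du * v16;                  /* another fmadd */
  }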
It should be possible to get mpn_addmul_1 faster with
floating-point, but the real gain will come from implementing
mpn_addmul_2, mpn_addmul_3, or even mpn_addmul_4.  Here is a
beginning for mpn_addmul_4:
[Attachment: addmul_N.asm]
include(`../config.m4')
divert(-1)
C For good G3 and G4 multiply speed, we need to base the multiply code on
C fmadd, going to at least mpn_addmul_4.
C
C The 603 can sustain one fmadd every 2nd cycle.
C
C The 604 can sustain one fmadd per cycle.
C
C The 75x can sustain one fmadd every 2nd cycle.
C
C The 7400 and 7410 can sustain 3 fmadd in 4 cycles.
C
C The 744x and 745x family can sustain 4 fmadd in 5 cycles.
C
C We therefore get these mpn_addmul_N performance limits for the various
C processors:
C               current    speed limit    speed limit
C               addmul_1   int addmul_N   fp addmul_N
C 603           ?          ?-11           4
C 604           6.75       4              2
C 75x           8.7-14.3   ?-11           4
C 7400/7410     8.7-14.3   ?-11           2.67
C 744x/745x     9.5        4              2.5
C A major obstacle for fp-based mpn_addmul_N is converting the fp product-sums
C to two 32-bit integers.  Operands will be limited as follows:
C   N    1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
C  32   48  49  50  50  51  51  51  51  52  52  52  52  52  52  52  52
C  64   49  50  51  51  52  52  52  52  53  53  53  53  53  53  53  53
C Here is a trick that avoids using more fp operations:
C fadd a, a, 2^52
C stfd a, 0(r1)
C lwz r3, 0(r1)
C lwz r4, 4(r1)
C rlwinm r3,r3,0,0xfffff
C The idea is that adding 2^52 puts the bits we need in fixed positions, so
C that the low 32 bits of the fp number are also the 32 low integer bits.
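C (A small standalone C demonstration of this trick follows after this file.)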
define(`rp',`r3')
define(`up',`r4')
define(`n',`r5')
define(`vp',`r6')
C Free: r0
define(`iacc000',`r7')
define(`iacc016',`r8')
define(`iacc032',`r9')
define(`iacc048',`r10')
define(`cy000',`r11')
define(`cy032',`r12')
define(`t0',`r31')
define(`iacc064',`r30')
define(`none',`r29')
define(`none',`r28')
define(`none',`r27')
define(`none',`r26')
define(`none',`r25')
define(`none',`r24')
define(`none',`r23')
define(`none',`r22')
define(`none',`r21')
define(`none',`r20')
define(`v000',`f0')
define(`v016',`f1')
define(`v032',`f2')
define(`v048',`f3')
define(`v064',`f4')
define(`v080',`f5')
define(`v096',`f6')
define(`v112',`f7')
define(`acc000',`f8')
define(`acc016',`f9')
define(`acc032',`f10')
define(`acc048',`f11')
define(`acc064',`f12')
define(`acc080',`f13')
define(`acc096',`f31')
define(`acc112',`f30')
define(`two52',`f29')
define(`ft0',`f28')
divert
ifdef(`DARWIN',
`define(`LO',`lo16($1)')
define(`HI',`hi16($1)')',
`define(`LO',`$1@l')
define(`HI',`$1@ha')'
)
C DARWIN PIC:
C mflr tmpreg
C bcl 20,31,here
C here: mflr basereg
C mtlr tmpreg
C addis xreg, basereg, ha16(caddr-here)
C LDST reg, lo16(caddr-here)(xreg)
C AIX:
C LDST reg, caddr(2)
C ELF non-PIC:
C lis xreg, caddr@ha
C LDST reg, caddr@l(xreg)
C ELF PIC:
C .text
C caddr:.long CONSTANT
C mflr tmpreg
C bcl 20,31,here
C here: mflr basereg
C mtlr tmpreg
C LDST reg, caddr-here(basereg)
ASM_START()
.data
ALIGN(8)
.L0: .long 0x43300000, 0x00000000
.text
PROLOGUE(mpn_addmul_4)
stfd f31,-128(r1)
stfd f30,-120(r1)
stfd f29,-112(r1)
stfd f28,-104(r1)
stw r24,-64(r1)
stw r25,-60(r1)
stw r26,-56(r1)
stw r27,-52(r1)
stw r28,-48(r1)
stw r29,-44(r1)
stw r30,-40(r1)
stw r31,-36(r1)
stwu r1,-144(r1)
mtctr n
lis r0, 0x4330 C Keep upper 32 bits of the ...
stw r0, -8(r1) C ... double 2^52 at -8(r1)
lis r5, HI(.L0)
lfd two52, LO(.L0)(r5) C Keep 2^52 in two52
lfd acc032, LO(.L0)(r5) C acc032 = 2^52
lfd acc048, LO(.L0)(r5) C acc048 = 2^52
lfd acc064, LO(.L0)(r5) C acc064 = 2^52
lfd acc080, LO(.L0)(r5) C acc080 = 2^52
lfd acc096, LO(.L0)(r5) C acc096 = 2^52
lfd acc112, LO(.L0)(r5) C acc112 = 2^52
C Split the v operand into 16-bit chunks, and convert the pieces to 8 IEEE doubles
lwz r7, 0(vp)
rlwinm t0,r7,0,16,31 C v bits 0..15
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v000, ft0, two52
rlwinm t0,r7,16,16,31 C v bits 16..31
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v016, ft0, two52
lwz r7, 4(vp)
rlwinm t0,r7,0,16,31 C v bits 32..47
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v032, ft0, two52
rlwinm t0,r7,16,16,31 C v bits 48..63
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v048, ft0, two52
lwz r7, 8(vp)
rlwinm t0,r7,0,16,31 C v bits 64..79
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v064, ft0, two52
rlwinm t0,r7,16,16,31 C v bits 80..95
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v080, ft0, two52
lwz r7, 12(vp)
rlwinm t0,r7,0,16,31 C v bits 96..111
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v096, ft0, two52
rlwinm t0,r7,16,16,31 C v bits 112..127
stw t0, -4(r1)
lfd ft0, -8(r1)
fsub v112, ft0, two52
addi up, up, -4
addi rp, rp, -4
li cy000, 0
li cy032, 0
C IDEA: Could combine "rlwinm iacc048" into the "srwi iacc064".
C MAIN LOOP
Loop:
lwzu r0, 4(up)
stw r0, -4(r1)
lfd ft0, -8(r1)
fsub ft0, ft0, two52
fmadd acc000, ft0, v000, acc032
fmadd acc016, ft0, v016, acc048
fmadd acc032, ft0, v032, acc064
fmadd acc048, ft0, v048, acc080
fmadd acc064, ft0, v064, acc096
fmadd acc080, ft0, v080, acc112
fmadd acc096, ft0, v096, two52 C two52 = 2^52
fmadd acc112, ft0, v112, two52 C two52 = 2^52
C 64 32 0
stfd acc000, -32(r1) C |00_____|_______|
stfd acc016, -24(r1) C |00_____|_______|
lwz iacc032, -32(r1) C |00_____|
lwz iacc000, -28(r1) C |_______|
lwz iacc048, -24(r1) C |00_____|
lwz iacc016, -20(r1) C |_______|
rlwinm iacc032, iacc032, 0, 0xfffff
rlwinm iacc048, iacc048, 0, 0xfffff
C iacc032,iacc048 < 2^18
slwi t0, iacc016, 16
addc iacc000, iacc000, t0
srwi t0, iacc016, 16 C t0 < 2^16
adde iacc032, iacc032, t0 C iacc032 < (2^18 + 2^16)
slwi t0, iacc048, 16
addc iacc032, iacc032, t0
srwi iacc064, iacc048, 16 C iacc064 < 2^2
addze iacc064, iacc064
lwz t0, 4(rp)
addc iacc000, iacc000, t0
addze iacc032, iacc032
addze iacc064, iacc064
addc iacc000, iacc000, cy000
stwu iacc000, 4(rp)
adde cy000, iacc032, cy032
addze cy032, iacc064
bdnz Loop
C END MAIN LOOP
C Collect acc032, acc048, acc064, acc080, acc096, acc112, cy000, and cy032
C and store them at rp[4], rp[8], rp[12], plus the return value.
stfd acc032, -32(r1) C |00_____|_______|
stfd acc048, -24(r1) C |00_____|_______|
lwz iacc032, -32(r1) C |00_____|
lwz iacc000, -28(r1) C |_______|
lwz iacc048, -24(r1) C |00_____|
lwz iacc016, -20(r1) C |_______|
rlwinm iacc032, iacc032, 0, 0xfffff
rlwinm iacc048, iacc048, 0, 0xfffff
slwi t0, iacc016, 16
addc iacc000, iacc000, t0
srwi t0, iacc016, 16 C t0 < 2^16
adde iacc032, iacc032, t0 C iacc032 < (2^18 + 2^16)
slwi t0, iacc048, 16
addc iacc032, iacc032, t0
srwi iacc064, iacc048, 16 C iacc064 < 2^2 ???
addze iacc064, iacc064
addc iacc000, iacc000, cy000
stwu iacc000, 4(rp)
adde cy000, iacc032, cy032
addze cy032, iacc064
stfd acc064, -32(r1) C |00_____|_______|
stfd acc080, -24(r1) C |00_____|_______|
lwz iacc032, -32(r1) C |00_____|
lwz iacc000, -28(r1) C |_______|
lwz iacc048, -24(r1) C |00_____|
lwz iacc016, -20(r1) C |_______|
rlwinm iacc032, iacc032, 0, 0xfffff
rlwinm iacc048, iacc048, 0, 0xfffff
slwi t0, iacc016, 16
addc iacc000, iacc000, t0
srwi t0, iacc016, 16 C t0 < 2^16
adde iacc032, iacc032, t0 C iacc032 < (2^18 + 2^16)
slwi t0, iacc048, 16
addc iacc032, iacc032, t0
srwi iacc064, iacc048, 16 C iacc064 < 2^2 ???
addze iacc064, iacc064
addc iacc000, iacc000, cy000
stwu iacc000, 4(rp)
adde cy000, iacc032, cy032
addze cy032, iacc064
stfd acc096, -32(r1) C |00_____|_______|
stfd acc112, -24(r1) C |00_____|_______|
lwz iacc032, -32(r1) C |00_____|
lwz iacc000, -28(r1) C |_______|
lwz iacc048, -24(r1) C |00_____|
lwz iacc016, -20(r1) C |_______|
rlwinm iacc032, iacc032, 0, 0xfffff
rlwinm iacc048, iacc048, 0, 0xfffff
slwi t0, iacc016, 16
addc iacc000, iacc000, t0
srwi t0, iacc016, 16 C t0 < 2^16
adde iacc032, iacc032, t0 C iacc032 < (2^18 + 2^16)
slwi t0, iacc048, 16
addc iacc032, iacc032, t0
C srwi iacc064, iacc048, 16 C iacc048 < 2^2 ???
addc iacc000, iacc000, cy000
stwu iacc000, 4(rp)
adde cy000, iacc032, cy032
C addze cy032, iacc064
mr r3, cy000
.LRet:
la r1,144(r1)
lfd f31,-128(r1)
lfd f30,-120(r1)
lfd f29,-112(r1)
lfd f28,-104(r1)
lwz r24,-64(r1)
lwz r25,-60(r1)
lwz r26,-56(r1)
lwz r27,-52(r1)
lwz r28,-48(r1)
lwz r29,-44(r1)
lwz r30,-40(r1)
lwz r31,-36(r1)
blr
C Alternative code for summing partial products:
C a0 = rl + cy000 + iacc000 + (iacc016 << 16);
C s1 = ((rl >> 16) + (cy000 >> 16) + (iacc000 >> 16) + (iacc016 & 0xffff));
C s1 += ((a0 >> 16) - s1) & 0xffff;
C t = (iacc048 << 16);
C cy000 = s1 + cy032 + iacc032 + (iacc016 >> 16) + t;
C cy032 = (iacc048 >> 16) + ((~a32 & t) >> 31);
EPILOGUE(mpn_addmul_4)
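To convince yourself of the 2^52 conversion trick described in the
comments above, here is a small standalone C check (my own sketch,
independent of the asm):

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  int
  main (void)
  {
    double two52 = 4503599627370496.0;   /* 2^52 */
    uint64_t x = 0x000f123456789abcULL;  /* any integer value below 2^52 */
    double a = (double) x;               /* exact, since x < 2^53 */
    uint64_t bits;

    a += two52;                          /* the "fadd a, a, 2^52" step */
    memcpy (&bits, &a, sizeof bits);     /* the "stfd" + "lwz" steps */

    /* The low 52 bits of the stored double are now x itself: on big-endian
       powerpc the low 32 bits are the word loaded with "lwz r4, 4(r1)" and
       the next 20 bits are what "rlwinm r3,r3,0,0xfffff" keeps.  */
    printf ("%s\n", (bits & 0x000fffffffffffffULL) == x ? "ok" : "broken");
    return 0;
  }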
--
Torbjörn