Improvements to powerpc32 asm code
   
    Torbjorn Granlund
     
    tege@swox.com
       
    01 Jun 2003 13:55:28 +0200
    
    
  
--=-=-=
Mark Rodenkirch <mrodenkirch@wi.rr.com> writes:
  My tests were done on a 7400.  I don't have any other CPUs to
  test it on.  I have attached my version of the code (below) if
  you are interested in comparing it to the new version.  I it
  quite possible that one version works better on the G4, while
  the other works better on the 604e.
  
Indeed.  I've tried to make my code run well on all ppc models.
Since 7400/7450 are now the most important powerpcs, we should
not sacrifice performance for them just for the case of avoiding
specialized code.
Unfortunately, the 7400 (aka G4) is less important than the 7450
(aka G4 too).  And their pipelines are quite different.  The
former has a 2 issue pipeline, while the latter can issue up to 3
instructions per cycle.  Latencies for 7450 are longer than for
7400.
All-in-all, the 7400 is much more similar to 750 (aka G3) than
to 7450.
  BTW, I am also looking at improving addmul_1.asm.  On my G4 I
  have it at a little over 7 cycles/limb (modeled after the
  powerpc64 routine), which is better than the current 8.5
  cycles/limb.  It contains a bug, so when I work that bug out, I
  might lose the added efficiency.  If anyone has already done
  that or has done it better, then I will stop working on it.
  
I have some preliminary code for mpn_addmul_1 which is a tad bit
faster than the code of GMP 4.1.2.  It might not work perfectly
yet.  Here it is:
--=-=-=
Content-Type: application/octet-stream
Content-Disposition: attachment; filename=addmul_1.asm
dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl the result to a second limb vector.
dnl Copyright 1995, 1997, 2000, 2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published by
dnl the Free Software Foundation; either version 2.1 of the License, or (at your
dnl option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl MA 02111-1307, USA.
dnl INPUT PARAMETERS
dnl rp	r3
dnl up	r4
dnl n	r5
dnl vl	r6
dnl CPU		SPEED
dnl 601
dnl 603
dnl 604
dnl 604e	6.25
dnl 630/PWR3
dnl 640/PWR4
dnl 7x0/G3	7.25-12.85
dnl 7400/G4	7.25-12.85
dnl 7450/G4+
dnl While this loop is faster than the checked-in code, it needs more registers
dnl and thus has significant overhead for saving and restoring callee-saves
dnl registers.  For the current 604e MUL_KARATSUBA_THRESHOLD, it is
dnl questionable if this code is really useful.  Decreasing the number of live
dnl registers is the most important optimization of this code.
include(`../config.m4')
ASM_START()
PROLOGUE(xmpn_addmul_1)
	srwi	r0,r5,2
	mtctr	r0
	addi	r3,r3,-4	C adjust res_ptr, it's offset before it's used
	addi	r4,r4,-4	C adjust res_ptr, it's offset before it's used
	li	r12,0		C clear upper product reg
	addic	r0,r0,0		C clear cy
C Start software pipeline
	stmw	r26,-24(r1)	C save registers we are supposed to preserve
C Software pipelined main loop
	lwz	r8,4(r4)
	lwz	r9,8(r4)
	mullw	r30,r8,r6
	mulhwu	r0,r8,r6
	lwz	r8,12(r4)
	mullw	r10,r9,r6
	mulhwu	r12,r9,r6
	lwz	r26,4(r3)
	lwzu	r9,16(r4)
	mullw	r11,r8,r6
	adde	r31,r10,r0
	mulhwu	r0,r8,r6
	lwz	r27,8(r3)
	addze	r12,r12
	adde	r26,r26,r30
	adde	r27,r27,r31
	stw	r26,4(r3)
	stwu	r27,8(r3)
	bdz	.Lend1
.Loop:	lwz	r8,4(r4)
	mullw	r10,r9,r6
	adde	r28,r11,r12
	mulhwu	r12,r9,r6
	lwz	r5,4(r3)
	lwz	r9,8(r4)
	mullw	r11,r8,r6
	adde	r29,r10,r0
	mulhwu	r0,r8,r6
	lwz	r7,8(r3)
	lwz	r8,12(r4)
	mullw	r10,r9,r6
	adde	r30,r11,r12
	mulhwu	r12,r9,r6
	lwz	r26,12(r3)
	lwzu	r9,16(r4)
	mullw	r11,r8,r6
	adde	r31,r10,r0
	mulhwu	r0,r8,r6
	lwz	r27,16(r3)
	addze	r12,r12
	addc	r5,r5,r28
	adde	r7,r7,r29
	adde	r26,r26,r30
	adde	r27,r27,r31
	stw	r5,4(r3)
	stw	r7,8(r3)
	stw	r26,12(r3)
	stwu	r27,16(r3)
	bdnz	.Loop
C Finish software pipeline
.Lend1:
	mullw	r10,r9,r6
	adde	r28,r11,r12
	mulhwu	r12,r9,r6
	adde	r29,r10,r0
	addze	r12,r12
	lwz	r5,4(r3)
	lwz	r7,8(r3)
	addc	r5,r5,r28
	adde	r7,r7,r29
	stw	r5,4(r3)
	stw	r7,8(r3)
	addze	r3,r12
	lmw	r26,-24(r1)	C restore registers from stack
	blr
	blr
EPILOGUE(xmpn_addmul_1)
--=-=-=
(Note that testing potentially buggy assembly routines can cause
problems with the random number generators of the test programs,
since they rely on mpn_addmul_1.  I usually prepend an x before
names of new code, xmpn_addmul_1, and set things up so that test
programs (try.c, etc) call xmpn_addmul_1 instead of
mpn_addmul_1.  This typically requires passing
-D__gmpn_addmul_1=xmpn_addmul_1 to the compiler.)
If we want to get most out of the current 32-bit (and 64-bit)
powerpc chips, we should use floating-point operations, taking
advantage of the fmadd instruction.  Most powerpcs can sustain
about one fmadd per cycle (7400 can bizarrely sustain 3 fmadd
each 4 cycles, 7450 can sustain 4 fmadd each 5 cycles).
It should be possible to get mpn_addmul_1 faster with
floating-point, but the real gain will come by implementing
mpn_addmul_2, mpn_addmul_3, or even mpn_addmul_4.  Here is a
beginning for mpn_addmul_4:
--=-=-=
Content-Type: application/octet-stream
Content-Disposition: attachment; filename=addmul_N.asm
include(`../config.m4')
divert(-1)
C For good G3 and G4 multiply speed, we need to base the multiply code on
C fmadd, going to at least mpn_addmul_4.
C
C The 603 can sustain one fmadd every 2nd cycle.
C
C The 604 can sustain one fmadd per cycle.
C
C The 75x can sustain one fmadd every 2nd cycle.
C
C The 7400 and 7410 can sustain 3 fmadd in 4 cycles.
C
C The 744x and 745x family can sustain 4 fmadd in 5 cycles.
C
C We therefore get these mpn_addmul_N performance limits for the various
C processors:
C		current		speed limit	speed limit
C		addmul_1	int addmul_N	fp addmul_N
C 603		?		?-11		4
C 604		6.75		4		2
C 75x		8.7-14.3	?-11		4
C 7400/7410	8.7-14.3	?-11		2.67
C 744x/745x	9.5		4		2.5
C A major obstacle for fp based mpn_addmul_N is converting the fp product-sums
C to two 32-bit integers.  Operands will be limted as follows:
C N    1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
C 32  48  49  50  50  51  51  51  51  52  52  52  52  52  52  52  52
C 64  49  50  51  51  52  52  52  52  53  53  53  53  53  53  53  53
C Here is a trick that avoids using more fp operations:
C	fadd	a, a, 2^52
C	stfd	a, 0(r1)
C	lwz	r3, 0(r1)
C	lwz	r4, 4(r1)
C	rlwinm	r3,r3,0,0xfffff
C The idea is that adding 2^52 puts the bits we need in fixed positions, so
C that the low 32 bits of the fp number are also the 32 low integer bits.
define(`rp',`r3')
define(`up',`r4')
define(`n',`r5')
define(`vp',`r6')
C Free: r0
define(`iacc000',`r7')
define(`iacc016',`r8')
define(`iacc032',`r9')
define(`iacc048',`r10')
define(`cy000',`r11')
define(`cy032',`r12')
define(`t0',`r31')
define(`iacc064',`r30')
define(`none',`r29')
define(`none',`r28')
define(`none',`r27')
define(`none',`r26')
define(`none',`r25')
define(`none',`r24')
define(`none',`r23')
define(`none',`r22')
define(`none',`r21')
define(`none',`r20')
define(`v000',`f0')
define(`v016',`f1')
define(`v032',`f2')
define(`v048',`f3')
define(`v064',`f4')
define(`v080',`f5')
define(`v096',`f6')
define(`v112',`f7')
define(`acc000',`f8')
define(`acc016',`f9')
define(`acc032',`f10')
define(`acc048',`f11')
define(`acc064',`f12')
define(`acc080',`f13')
define(`acc096',`f31')
define(`acc112',`f30')
define(`two52',`f29')
define(`ft0',`f28')
divert
ifdef(`DARWIN',
	`define(`LO',`lo16($1)')
	 define(`HI',`hi16($1)')',
	`define(`LO',`$1@l')
	 define(`HI',`$1@ha')'
)
C DARWIN PIC:
C	mflr	tmpreg
C	bcl	20,31,here
C here:	mflr	basereg
C	mtlr	tmpreg
C	addis	xreg, basereg, ha16(caddr-here)
C	LDST	reg, lo16(caddr-here)(xreg)
C AIX:
C	LDST	reg, caddr(2)
C ELF non-PIC:
C	lis	xreg, caddr@ha
C	LDST	reg, caddr@l(xreg)
C ELF PIC:
C	.text
C caddr:.long	CONSTANT
C	mflr	tmpreg
C	bcl	20,31,here
C here:	mflr	basereg
C	mtlr	tmpreg
C	LDST	reg, caddr-here(basereg)
ASM_START()
	.data
	ALIGN(8)
.L0:	.long	0x43300000, 0x00000000
	.text
PROLOGUE(mpn_addmul_4)
	stfd	f31,-128(r1)
	stfd	f30,-120(r1)
	stfd	f29,-112(r1)
	stfd	f28,-104(r1)
	stw	r24,-64(r1)
	stw	r25,-60(r1)
	stw	r26,-56(r1)
	stw	r27,-52(r1)
	stw	r28,-48(r1)
	stw	r29,-44(r1)
	stw	r30,-40(r1)
	stw	r31,-36(r1)
	stwu	r1,-144(r1)
	mtctr	n
	lis	r0, 0x4330			C Keep upper 32 bits of the ...
	stw	r0, -8(r1)			C ... double 2^52 at -8(r1)
	lis	r5, HI(.L0)
	lfd	two52, LO(.L0)(r5)		C Keep 2^52 in two52
	lfd	acc032, LO(.L0)(r5)		C acc032 = 2^52
	lfd	acc048, LO(.L0)(r5)		C acc048 = 2^52
	lfd	acc064, LO(.L0)(r5)		C acc064 = 2^52
	lfd	acc080, LO(.L0)(r5)		C acc080 = 2^52
	lfd	acc096, LO(.L0)(r5)		C acc096 = 2^52
	lfd	acc112, LO(.L0)(r5)		C acc112 = 2^52
C Split v operand into 16 bits chunks, and convert the pieces to 8 IEEE doubles
	lwz	r7, 0(vp)
	rlwinm	t0,r7,0,16,31			C v bits 0..15
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v000, ft0, two52
	rlwinm	t0,r7,16,16,31			C v bits 16..31
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v016, ft0, two52
	lwz	r7, 4(vp)
	rlwinm	t0,r7,0,16,31			C v bits 32..47
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v032, ft0, two52
	rlwinm	t0,r7,16,16,31			C v bits 48..63
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v048, ft0, two52
	lwz	r7, 8(vp)
	rlwinm	t0,r7,0,16,31			C v bits 64..79
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v064, ft0, two52
	rlwinm	t0,r7,16,16,31			C v bits 80..95
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v080, ft0, two52
	lwz	r7, 12(vp)
	rlwinm	t0,r7,0,16,31			C v bits 96..111
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v096, ft0, two52
	rlwinm	t0,r7,16,16,31			C v bits 112..127
	stw	t0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	v112, ft0, two52
	addi	up, up, -4
	addi	rp, rp, -4
	li	cy000, 0
	li	cy032, 0
C IDEA: Could combine "rlwinm iacc048" into the "srwi iacc064".
C MAIN LOOP
Loop:
	lwzu	r0, 4(up)
	stw	r0, -4(r1)
	lfd	ft0, -8(r1)
	fsub	ft0, ft0, two52
	fmadd	acc000, ft0, v000, acc032
	fmadd	acc016, ft0, v016, acc048
	fmadd	acc032, ft0, v032, acc064
	fmadd	acc048, ft0, v048, acc080
	fmadd	acc064, ft0, v064, acc096
	fmadd	acc080, ft0, v080, acc112
	fmadd	acc096, ft0, v096, two52		C two52 = 2^52
	fmadd	acc112, ft0, v112, two52		C two52 = 2^52
						C    64      32       0
	stfd	acc000, -32(r1)			C     |00_____|_______|
	stfd	acc016, -24(r1)			C |00_____|_______|
	lwz	iacc032, -32(r1)		C     |00_____|
	lwz	iacc000, -28(r1)		C             |_______|
	lwz	iacc048, -24(r1)		C |00_____|
	lwz	iacc016, -20(r1)		C         |_______|
	rlwinm	iacc032, iacc032, 0, 0xfffff
	rlwinm	iacc048, iacc048, 0, 0xfffff
						C iacc032,iacc048 < 2^18
	slwi	t0, iacc016, 16
	addc	iacc000, iacc000, t0
	srwi	t0, iacc016, 16			C t0 < 2^16
	adde	iacc032, iacc032, t0		C iacc032 < (2^18 + 2^16)
	slwi	t0, iacc048, 16
	addc	iacc032, iacc032, t0
	srwi	iacc064, iacc048, 16		C iacc064 < 2^2
	addze	iacc064, iacc064
	lwz	t0, 4(rp)
	addc	iacc000, iacc000, t0
	addze	iacc032, iacc032
	addze	iacc064, iacc064
	addc	iacc000, iacc000, cy000
	stwu	iacc000, 4(rp)
	adde	cy000, iacc032, cy032
	addze	cy032, iacc064
	bdnz	Loop
C END MAIN LOOP
C Collect acc032, acc048, acc064, acc080, acc096, acc112, cy000, and cy032
C and store them at rp[4], rp[8], rp[12], plus the return value.
	stfd	acc032, -32(r1)			C     |00_____|_______|
	stfd	acc048, -24(r1)			C |00_____|_______|
	lwz	iacc032, -32(r1)		C     |00_____|
	lwz	iacc000, -28(r1)		C             |_______|
	lwz	iacc048, -24(r1)		C |00_____|
	lwz	iacc016, -20(r1)		C         |_______|
	rlwinm	iacc032, iacc032, 0, 0xfffff
	rlwinm	iacc048, iacc048, 0, 0xfffff
	slwi	t0, iacc016, 16
	addc	iacc000, iacc000, t0
	srwi	t0, iacc016, 16			C t0 < 2^16
	adde	iacc032, iacc032, t0		C iacc032 < (2^18 + 2^16)
	slwi	t0, iacc048, 16
	addc	iacc032, iacc032, t0
	srwi	iacc064, iacc048, 16		C iacc064 < 2^2 ???
	addze	iacc064, iacc064
	addc	iacc000, iacc000, cy000
	stwu	iacc000, 4(rp)
	adde	cy000, iacc032, cy032
	addze	cy032, iacc064
	stfd	acc064, -32(r1)			C     |00_____|_______|
	stfd	acc080, -24(r1)			C |00_____|_______|
	lwz	iacc032, -32(r1)		C     |00_____|
	lwz	iacc000, -28(r1)		C             |_______|
	lwz	iacc048, -24(r1)		C |00_____|
	lwz	iacc016, -20(r1)		C         |_______|
	rlwinm	iacc032, iacc032, 0, 0xfffff
	rlwinm	iacc048, iacc048, 0, 0xfffff
	slwi	t0, iacc016, 16
	addc	iacc000, iacc000, t0
	srwi	t0, iacc016, 16			C t0 < 2^16
	adde	iacc032, iacc032, t0		C iacc032 < (2^18 + 2^16)
	slwi	t0, iacc048, 16
	addc	iacc032, iacc032, t0
	srwi	iacc064, iacc048, 16		C iacc064 < 2^2 ???
	addze	iacc064, iacc064
	addc	iacc000, iacc000, cy000
	stwu	iacc000, 4(rp)
	adde	cy000, iacc032, cy032
	addze	cy032, iacc064
	stfd	acc096, -32(r1)			C     |00_____|_______|
	stfd	acc112, -24(r1)			C |00_____|_______|
	lwz	iacc032, -32(r1)		C     |00_____|
	lwz	iacc000, -28(r1)		C             |_______|
	lwz	iacc048, -24(r1)		C |00_____|
	lwz	iacc016, -20(r1)		C         |_______|
	rlwinm	iacc032, iacc032, 0, 0xfffff
	rlwinm	iacc048, iacc048, 0, 0xfffff
	slwi	t0, iacc016, 16
	addc	iacc000, iacc000, t0
	srwi	t0, iacc016, 16			C t0 < 2^16
	adde	iacc032, iacc032, t0		C iacc032 < (2^18 + 2^16)
	slwi	t0, iacc048, 16
	addc	iacc032, iacc032, t0
C	srwi	iacc064, iacc048, 16		C iacc048 < 2^2 ???
	addc	iacc000, iacc000, cy000
	stwu	iacc000, 4(rp)
	adde	cy000, iacc032, cy032
C	addze	cy032, iacc064
	mr	r3, cy000
.LRet:
	la	r1,144(r1)
	lfd	f31,-128(r1)
	lfd	f30,-120(r1)
	lfd	f29,-112(r1)
	lfd	f28,-104(r1)
	lwz	r24,-64(r1)
	lwz	r25,-60(r1)
	lwz	r26,-56(r1)
	lwz	r27,-52(r1)
	lwz	r28,-48(r1)
	lwz	r29,-44(r1)
	lwz	r30,-40(r1)
	lwz	r31,-36(r1)
	blr
C Alternative code for summing partial products:
C      a0 = rl + cy000 + iacc000 + (iacc016 << 16);
C      s1 = ((rl >> 16) + (cy000 >> 16) + (iacc000 >> 16) + (iacc016 & 0xffff));
C      s1 += ((a0 >> 16) - s1) & 0xffff;
C      t = (iacc048 << 16);
C      cy000 = s1 + cy032 + iacc032 + (iacc016 >> 16) + t;
C      cy032 = (iacc048 >> 16) + ((~a32 & t) >> 31);
EPILOGUE(mpn_addmul_4)
--=-=-=
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: 8bit
--
Torbjörn
--=-=-=--