IA64: non L1-hits for load operations at low-level functions (mpn_add_n)

Torbjorn Granlund tege at swox.com
Thu Oct 13 12:21:30 CEST 2005


usenett at gmx.de writes:

  I have a question about the low-level functions like mpn_add_n. It seems to
  me that the load-operations (in my case for ia64) are assumed to be L1-hits
  with a latency of 1 clock cycle. Consider the case where a load operation
  is an L2 or L3 hit. Can this case occur, and what behavior should be
  expected from the function?
  
In the case of the Itanic implementations to date, execution will
stall at each cache line border crossing for around 15 cycles.

If you like surreal experiences, you could hack the code in order
to improve the performance for huge additions by sticking some
lfetch instructions into the loop.  I had a go at it below.  With
some luck, this code (if somebody completes it) should run at 2
c/l from L1 cache, and not much worse from L2 and L3 cache.

	C Pipeline feed-in: fetch the first pair of source limbs into r16/r17.
	C Each ld8 post-increments its pointer (r33, r34) by 8 bytes.
	ld8		r16 = [r33], 8		C software pipeline feed in
	ld8		r17 = [r34], 8		C software pipeline feed in
	C Counted-loop branch: enter .Loop0 if iterations remain, otherwise fall
	C through to the plain branch and finish the single pending limb pair.
	C NOTE(review): the setup of ar.lc, the saved copy in r2, and the initial
	C carry predicates p8/p9 are not part of this excerpt -- confirm in caller.
	br.cloop.dptk	.Loop0			C B0
        br              .Lexit1

		C .Loop0: first half of the two-way unrolled loop.  It consumes the
		C limb pair already held in r16/r17 (carry-in in p8/p9) while loading
		C the next pair into r14/r15, and leaves the carry-out in p6/p7.
		ALIGN(32)
.Loop0:
.mmb
	ld8		r14 = [r33], 8		C M0
	ld8		r15 = [r34], 8		C M1
	nop.b		0			C B0
.mmb
	C Exactly one of the two adds is expected to execute: p8 = carry-in set,
	C p9 = carry-in clear (presumably complementary; set by the previous cmp).
   (p8)	add		r19 = r16, r17, 1	C M2
   (p9)	add		r19 = r16, r17		C M3
	nop.b		0			C B1
	;;
.mmi
	C The lfetch operands are left as "..." placeholders in this sketch;
	C the prefetch addresses still have to be filled in.
	lfetch		...			C M0
	lfetch		...			C M1
	C Carry-out detection by unsigned compare against one addend:
	C with carry-in the sum wrapped iff r19 <= r16, without iff r19 < r16.
   (p8)	cmp.leu		p6, p7 = r19, r16	C I0
.mmb
   (p9)	cmp.ltu		p6, p7 = r19, r16	C M2
	C Store the result limb and advance the destination pointer r32 by 8.
	st8		[r32] = r19, 8		C M3
	br.cloop.dptk	.Loop1			C B0
	;;

	C Fall-through tail: the loop counter ran out while r14/r15 still hold an
	C unprocessed limb pair.  Add them with the carry-in from p6/p7.
	C NOTE(review): r19 is computed here but never stored to [r32] before the
	C return -- looks like a missing st8 in this unfinished sketch; confirm.
   (p6)	add		r19 = r14, r15, 1
   (p7)	add		r19 = r14, r15
	;;
   (p6)	cmp.leu		p8, p9 = r19, r14
   (p7)	cmp.ltu		p8, p9 = r19, r14
	;;
	C r8 receives the final carry-out (1 or 0); presumably the return value.
   (p8)	mov		r8 = 1
   (p9)	mov		r8 = 0
	C Write r2 back into ar.lc (presumably the caller's saved loop count).
	mov		ar.lc = r2
	br.ret.sptk.many b0

	C .Loop1: second half of the two-way unrolled loop, the mirror image of
	C .Loop0 with the register roles swapped.  It consumes the limb pair in
	C r14/r15 (carry-in in p6/p7) while loading the next pair into r16/r17,
	C and leaves the carry-out in p8/p9.
	ALIGN(32)
.Loop1:
.mmb
	ld8		r16 = [r33], 8		C M0
	ld8		r17 = [r34], 8		C M1
	nop.b		0			C B0
.mmb
	C p6 = carry-in set, p7 = carry-in clear (presumably complementary).
   (p6)	add		r19 = r14, r15, 1	C M2
   (p7)	add		r19 = r14, r15		C M3
	nop.b		0			C B1
	;;
.mmi
	C These two M slots hold nops here; .Loop0 has lfetch in the same slots.
	nop.m		0			C M0
	nop.m		0			C M1
	C Carry-out: with carry-in the sum wrapped iff r19 <= r14, else iff
	C r19 < r14 (unsigned compares).
   (p6)	cmp.leu		p8, p9 = r19, r14	C I0
.mmb
   (p7)	cmp.ltu		p8, p9 = r19, r14	C M2
	C Store the result limb and advance the destination pointer r32 by 8.
	st8		[r32] = r19, 8		C M3
	br.cloop.dptk	.Loop0			C B0
	;;

C .Lexit1: tail reached either by falling out of .Loop1 or directly from the
C feed-in code.  r16/r17 hold the last unprocessed limb pair and p8/p9 the
C carry-in.
C NOTE(review): r19 is computed here but never stored to [r32] before the
C return -- looks like a missing st8 in this unfinished sketch; confirm.
.Lexit1:
   (p8)	add		r19 = r16, r17, 1
   (p9)	add		r19 = r16, r17
	;;
	C Carry-out: with carry-in the sum wrapped iff r19 <= r16, else iff
	C r19 < r16 (unsigned compares).
   (p8)	cmp.leu		p6, p7 = r19, r16
   (p9)	cmp.ltu		p6, p7 = r19, r16
	;;
	C r8 receives the final carry-out (1 or 0); presumably the return value.
   (p6)	mov		r8 = 1
   (p7)	mov		r8 = 0
	C Write r2 back into ar.lc (presumably the caller's saved loop count).
	mov		ar.lc = r2
	br.ret.sptk.many b0

-- 
Torbjörn


More information about the gmp-discuss mailing list