[Gmp-commit] /home/hgfiles/gmp: 3 new changesets
mercurial at gmplib.org
Sat Dec 18 00:41:28 CET 2010
details: /home/hgfiles/gmp/rev/ee84b258b5dd
changeset: 13712:ee84b258b5dd
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Dec 15 22:13:00 2010 +0100
description:
Rewrite popcount to use vperm count table.
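[The rewritten inner loop counts bits with a 16-entry nibble table: vperm uses each input nibble as an index into a vector register holding popcount(0..15), so one instruction performs sixteen table lookups. A minimal scalar C sketch of the same idea, for context only — this is not the committed PowerPC asm, and the function and variable names are hypothetical:

#include <stdint.h>
#include <stddef.h>

/* Population count of each nibble 0..15 -- the same sixteen values
   the committed code loads into the vperm table register `rtab'.  */
static const uint8_t nibble_count[16] = {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};

/* Scalar analogue of the vperm loop: look up the low and the high
   nibble of every byte and accumulate.  vperm does this for sixteen
   bytes at a time.  */
static unsigned long
popcount_tab (const uint64_t *ap, size_t n)
{
  unsigned long cnt = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t x = ap[i];
      for (int b = 0; b < 8; b++, x >>= 8)
        cnt += nibble_count[x & 0xf] + nibble_count[(x >> 4) & 0xf];
    }
  return cnt;
}
]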
details: /home/hgfiles/gmp/rev/273346d8aa68
changeset: 13713:273346d8aa68
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Dec 15 22:14:38 2010 +0100
description:
Misc cleanups.
details: /home/hgfiles/gmp/rev/5995c8359734
changeset: 13714:5995c8359734
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Dec 16 21:31:10 2010 +0100
description:
Rewrite x86_64 mod_34lsub1.asm.
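[mpn_mod_34lsub1 returns a value congruent to {ap,n} modulo 2^48-1, i.e. 2^(3*GMP_LIMB_BITS/4)-1 with 64-bit limbs. Since 2^48 == 1 (mod 2^48-1), limb i contributes at bit offset (64*i) mod 48, which cycles through 0, 16, 32. A rough C sketch of that arithmetic identity, assuming 64-bit limbs and __int128 support — not the committed x86_64 asm, names hypothetical:

#include <stdint.h>
#include <stddef.h>

static uint64_t
mod_2pow48m1 (const uint64_t *ap, size_t n)
{
  /* Each term is below 2^96, so a 128-bit accumulator is wide
     enough for any realistic n.  */
  unsigned __int128 acc = 0;

  /* 2^64 == 2^16 and 2^128 == 2^32 (mod 2^48-1), so the shift
     pattern repeats every three limbs.  */
  static const unsigned shift[3] = { 0, 16, 32 };
  for (size_t i = 0; i < n; i++)
    acc += (unsigned __int128) ap[i] << shift[i % 3];

  /* Fold the bits above bit 48 back in; each pass uses 2^48 == 1.  */
  while (acc >> 48)
    acc = (acc & 0xffffffffffffULL) + (acc >> 48);

  /* The result may be 2^48-1, which is congruent to 0; like the
     real routine, this leaves the final reduction to the caller.  */
  return (uint64_t) acc;
}

Asm implementations of this typically keep a separate accumulator per shift phase inside the loop and fold only once at the end; the sketch above shows just the underlying identity.]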
diffstat:
ChangeLog | 10 +
mpn/powerpc64/vmx/popcount.asm | 154 ++++++++++----------------
mpn/x86_64/mod_34lsub1.asm | 207 +++++++++++++++++++----------------
mpn/x86_64/pentium4/mod_34lsub1.asm | 150 ++++++++++++++++++++++++++
mpn/x86_64/popham.asm | 148 ++++++++++++------------
5 files changed, 402 insertions(+), 267 deletions(-)
diffs (truncated from 883 to 300 lines):
diff -r b5cfc709ff71 -r 5995c8359734 ChangeLog
--- a/ChangeLog Tue Dec 14 13:30:39 2010 +0100
+++ b/ChangeLog Thu Dec 16 21:31:10 2010 +0100
@@ -1,3 +1,13 @@
+2010-12-16 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/mod_34lsub1.asm: Complete rewrite.
+ * mpn/x86_64/pentium4/mod_34lsub1.asm: New file, old
+ mpn/x86_64/mod_34lsub1.asm.
+
+2010-12-15 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/powerpc64/vmx/popcount.asm: Rewrite to use vperm count table.
+
2010-12-14 Torbjorn Granlund <tege at gmplib.org>
* mp-h.in: Remove.
diff -r b5cfc709ff71 -r 5995c8359734 mpn/powerpc64/vmx/popcount.asm
--- a/mpn/powerpc64/vmx/popcount.asm Tue Dec 14 13:30:39 2010 +0100
+++ b/mpn/powerpc64/vmx/popcount.asm Thu Dec 16 21:31:10 2010 +0100
@@ -1,6 +1,6 @@
dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
-dnl Copyright 2006 Free Software Foundation, Inc.
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,17 +20,16 @@
include(`../config.m4')
C cycles/limb
-C 7400,7410 (G4): 2.75
-C 744x,745x (G4+): 2.25
-C 970 (G5): 5.3
+C 7400,7410 (G4): ?
+C 744x,745x (G4+): 1.125
+C 970 (G5): 2.25
C STATUS
C * Works for all sizes and alignments.
C TODO
-C * Tune the awkward huge n outer loop code.
+C * Rewrite the awkward huge n outer loop code.
C * Two lvx, two vperm, and two vxor could make us a similar hamdist.
-C * For the 970, a combined VMX+intop approach might be best.
C * Compress cnsts table in 64-bit mode, only half the values are needed.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
@@ -39,26 +38,11 @@
define(`OPERATION_popcount')
-ifdef(`OPERATION_popcount',`
- define(`func',`mpn_popcount')
- define(`up', `r3')
- define(`n', `r4')
- define(`HAM', `dnl')
-')
-ifdef(`OPERATION_hamdist',`
- define(`func',`mpn_hamdist')
- define(`up', `r3')
- define(`vp', `r4')
- define(`n', `r5')
- define(`HAM', `$1')
-')
+define(`ap', `r3')
+define(`n', `r4')
-define(`x01010101',`v2')
-define(`x00110011',`v7')
-define(`x00001111',`v10')
-define(`cnt1',`v11')
-define(`cnt2',`v12')
-define(`cnt4',`v13')
+define(`rtab', `v10')
+define(`cnt4', `v11')
ifelse(GMP_LIMB_BITS,32,`
define(`LIMB32',` $1')
@@ -85,30 +69,29 @@
C Load various constants into vector registers
LEAL( r11, cnsts)
li r12, 16
- vspltisb cnt1, 1 C 0x0101...01 used as shift count
- vspltisb cnt2, 2 C 0x0202...02 used as shift count
vspltisb cnt4, 4 C 0x0404...04 used as shift count
- lvx x01010101, 0, r11 C 0x3333...33
- lvx x00110011, r12, r11 C 0x5555...55
- vspltisb x00001111, 15 C 0x0f0f...0f
+
+ li r7, 160
+ lvx rtab, 0, r11
LIMB64(`lis r0, LIMBS_CHUNK_THRES ')
LIMB64(`cmpd cr7, n, r0 ')
- lvx v0, 0, up
- addi r7, r11, 96
- rlwinm r6, up, 2,26,29
+ lvx v0, 0, ap
+ addi r7, r11, 80
+ rlwinm r6, ap, 2,26,29
lvx v8, r7, r6
vand v0, v0, v8
-LIMB32(`rlwinm r8, up, 30,30,31 ')
-LIMB64(`rlwinm r8, up, 29,31,31 ')
- add n, n, r8 C compensate n for rounded down `up'
+LIMB32(`rlwinm r8, ap, 30,30,31 ')
+LIMB64(`rlwinm r8, ap, 29,31,31 ')
+ add n, n, r8 C compensate n for rounded down `ap'
vxor v1, v1, v1
li r8, 0 C grand total count
- vxor v3, v3, v3 C zero total count
+ vxor v12, v12, v12 C zero total count
+ vxor v13, v13, v13 C zero total count
addic. n, n, -LIMBS_PER_VR
ble L(sum)
@@ -120,82 +103,61 @@
LIMB64(`ble cr7, L(small) ')
LIMB64(`addis r9, n, -LIMBS_PER_CHUNK ') C remaining n
LIMB64(`lis n, LIMBS_PER_CHUNK ')
+
+ ALIGN(16)
L(small):
-
-
LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
addi r7, r7, 1
mtctr r7 C copy n to count register
b L(ent)
- ALIGN(8)
-L(top): lvx v0, 0, up
- li r7, 128 C prefetch distance
-L(ent): lvx v1, r12, up
- addi up, up, 32
- vsr v4, v0, cnt1
- vsr v5, v1, cnt1
- dcbt up, r7 C prefetch
- vand v8, v4, x01010101
- vand v9, v5, x01010101
- vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
- vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
- vsr v4, v0, cnt2
- vsr v5, v1, cnt2
- vand v8, v0, x00110011
- vand v9, v1, x00110011
- vand v4, v4, x00110011
- vand v5, v5, x00110011
- vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
- vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
- vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
- vsr v9, v8, cnt4
- vand v6, v8, x00001111
- vand v9, v9, x00001111
- vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
- vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
+ ALIGN(16)
+L(top):
+ lvx v0, 0, ap
+L(ent): lvx v1, r12, ap
+ addi ap, ap, 32
+ vsrb v8, v0, cnt4
+ vsrb v9, v1, cnt4
+ vperm v2, rtab, rtab, v0
+ vperm v3, rtab, rtab, v8
+ vperm v4, rtab, rtab, v1
+ vperm v5, rtab, rtab, v9
+ vaddubm v6, v2, v3
+ vaddubm v7, v4, v5
+ vsum4ubs v12, v6, v12
+ vsum4ubs v13, v7, v13
bdnz L(top)
andi. n, n, eval(LIMBS_PER_2VR-1)
beq L(rt)
- lvx v0, 0, up
+ lvx v0, 0, ap
vxor v1, v1, v1
cmpwi n, LIMBS_PER_VR
ble L(sum)
L(lsum):
vor v1, v0, v0
- lvx v0, r12, up
+ lvx v0, r12, ap
L(sum):
LIMB32(`rlwinm r6, n, 4,26,27 ')
LIMB64(`rlwinm r6, n, 5,26,26 ')
- addi r7, r11, 32
+ addi r7, r11, 16
lvx v8, r7, r6
vand v0, v0, v8
+ vsrb v8, v0, cnt4
+ vsrb v9, v1, cnt4
+ vperm v2, rtab, rtab, v0
+ vperm v3, rtab, rtab, v8
+ vperm v4, rtab, rtab, v1
+ vperm v5, rtab, rtab, v9
+ vaddubm v6, v2, v3
+ vaddubm v7, v4, v5
+ vsum4ubs v12, v6, v12
+ vsum4ubs v13, v7, v13
- vsr v4, v0, cnt1
- vsr v5, v1, cnt1
- vand v8, v4, x01010101
- vand v9, v5, x01010101
- vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
- vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
- vsr v4, v0, cnt2
- vsr v5, v1, cnt2
- vand v8, v0, x00110011
- vand v9, v1, x00110011
- vand v4, v4, x00110011
- vand v5, v5, x00110011
- vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
- vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
- vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
- vsr v9, v8, cnt4
- vand v6, v8, x00001111
- vand v9, v9, x00001111
- vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
- vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
-
-L(rt):
+ ALIGN(16)
+L(rt): vadduwm v3, v12, v13
li r7, -16 C FIXME: does all ppc32 and ppc64 ABIs
stvx v3, r7, r1 C FIXME: ...support storing below sp?
@@ -210,7 +172,8 @@
C Handle outer loop for huge n. We inherit cr7 and r0 from above.
LIMB64(`ble cr7, L(ret)
- vxor v3, v3, v3 C zero total count
+ vxor v12, v12, v12 C zero total count
+ vxor v13, v13, v13 C zero total count
mr n, r9
cmpd cr7, n, r0
ble cr7, L(2)
@@ -221,17 +184,16 @@
b L(top)
')
+ ALIGN(16)
L(ret): mr r3, r8
mtspr 256, r10
blr
EPILOGUE()
DEF_OBJECT(cnsts,16)
- .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
- .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
-
- .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
- .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+C Counts for vperm
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
diff -r b5cfc709ff71 -r 5995c8359734 mpn/x86_64/mod_34lsub1.asm
--- a/mpn/x86_64/mod_34lsub1.asm Tue Dec 14 13:30:39 2010 +0100
+++ b/mpn/x86_64/mod_34lsub1.asm Thu Dec 16 21:31:10 2010 +0100
@@ -1,64 +1,43 @@
dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
-dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
-dnl Inc.
-dnl
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010 Free Software
+dnl Foundation, Inc.
+
dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+