[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
Fri Jul 12 18:53:26 CEST 2013
details: /var/hg/gmp/rev/55ccd836a8ac
changeset: 15862:55ccd836a8ac
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Jul 11 22:39:24 2013 +0200
description:
Remove spurious ABI_SUPPORT decls.
details: /var/hg/gmp/rev/c30b7c2da2ff
changeset: 15863:c30b7c2da2ff
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Jul 11 23:34:00 2013 +0200
description:
Tweak for better speed on K8, bobcat, bd1, NHM, Atom.
details: /var/hg/gmp/rev/8de68af9fb4b
changeset: 15864:8de68af9fb4b
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Jul 12 12:21:42 2013 +0200
description:
Partial rewrite.
diffstat:
mpn/generic/sbpi1_div_sec.c | 94 +++++++++++++++++++++++++-------------------
mpn/x86/cnd_aors_n.asm | 3 -
mpn/x86_64/cnd_aors_n.asm | 46 +++++++++++-----------
3 files changed, 77 insertions(+), 66 deletions(-)
diffs (282 lines):
diff -r 45c38e4c1de2 -r 8de68af9fb4b mpn/generic/sbpi1_div_sec.c
--- a/mpn/generic/sbpi1_div_sec.c Fri Jul 05 14:45:17 2013 +0200
+++ b/mpn/generic/sbpi1_div_sec.c Fri Jul 12 12:21:42 2013 +0200
@@ -8,7 +8,7 @@
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-Copyright 2011, 2012 Free Software Foundation, Inc.
+Copyright 2011, 2012, 2013 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -29,6 +29,28 @@
#include "gmp-impl.h"
#include "longlong.h"
+/* This side-channel silent division algorithm reduces the partial remainder by
+ GMP_NUMB_BITS/2 bits at a time, compared to GMP_NUMB_BITS for the main
+ division algorithm. We do not insist on reducing by exactly
+ GMP_NUMB_BITS/2, but may leave a partial remainder that is D*B^i to 3D*B^i
+ too large (B is the limb base, D is the divisor, and i is the induction
+ variable); the subsequent step will handle the extra partial remainder bits.
+
+ With that partial remainder reduction, each step generates a quotient "half
+ limb". The outer loop generates two quotient half limbs, an upper (q1h) and
+ a lower (q0h), which are stored sparsely in separate limb arrays. These
+ arrays are added at the end; using separate arrays avoids data-dependent
+ carry propagation which could otherwise pose a side-channel leakage problem.
+
+ The quotient half limbs may be between -3 and 0 from the accurate value
+ ("accurate" being the one which corresponds to a reduction to a principal
+ partial remainder). Quotient half limbs that are too small correspond to
+ remainders that are too large, which we reduce later, as described above.
+
+ In order to keep quotients from getting too big, corresponding to a negative
+ partial remainder, we use an inverse which is slightly smaller than usual.
+*/
+
#if OPERATION_sbpi1_div_qr_sec
/* Needs (dn + 1) + (nn - dn) + (nn - dn) = 2nn - dn + 1 limbs at tp. */
#define FNAME mpn_sbpi1_div_qr_sec
@@ -49,7 +71,7 @@
mp_limb_t dinv,
mp_ptr tp)
{
- mp_limb_t nh, cy, q1h, q0h, dummy, h;
+ mp_limb_t nh, cy, q1h, q0h, dummy, cnd;
mp_size_t i;
mp_ptr hp;
#if OPERATION_sbpi1_div_qr_sec
@@ -72,77 +94,69 @@
#endif
}
+ /* Decrement inverse to keep quotient half limbs from being too large. */
+ dinv -= dinv != 0; /* FIXME: cmp-to-int */
+
/* Create a divisor copy shifted half a limb. */
hp = tp; /* (dn + 1) limbs */
- cy = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2);
- hp[dn] = dp[dn - 1] >> GMP_NUMB_BITS / 2;
+ hp[dn] = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2);
#if OPERATION_sbpi1_div_qr_sec
qlp = tp + (dn + 1); /* (nn - dn) limbs */
qhp = tp + (nn + 1); /* (nn - dn) limbs */
#endif
- np += nn;
+ np += nn - dn;
+ nh = 0;
- /* Main loop. Develop one full limb per iteration, but do it in two steps in
- order to avoid conditionals. Quotient bits will be either correct or
- underestimates. When a quotient is underestimated, the next quotient will
- compensate, since quotients are to be added at consecutive weight distance
- GMP_NUMB_BITS/2. We make two quotient arrays, each with GMP_NUMB_BITS/2+2
- bits per entry. The arrays are added late after the loop. Separate
- arrays avoid data-dependent carry propagation. */
- nh = 0;
for (i = nn - dn - 1; i >= 0; i--)
{
np--;
- nh = (nh << GMP_NUMB_BITS/2) + (np[0] >> GMP_NUMB_BITS/2);
+ nh = (nh << GMP_NUMB_BITS/2) + (np[dn] >> GMP_NUMB_BITS/2);
umul_ppmm (q1h, dummy, nh, dinv);
q1h += nh;
#if OPERATION_sbpi1_div_qr_sec
qhp[i] = q1h;
#endif
- cy = mpn_submul_1 (np - dn, hp, dn + 1, q1h);
+ mpn_submul_1 (np, hp, dn + 1, q1h);
- nh = np[0];
+ nh = np[dn];
umul_ppmm (q0h, dummy, nh, dinv);
q0h += nh;
#if OPERATION_sbpi1_div_qr_sec
qlp[i] = q0h;
#endif
- cy = mpn_submul_1 (np - dn, dp, dn, q0h);
-
- nh -= cy;
+ nh -= mpn_submul_1 (np, dp, dn, q0h);
}
- np[0] = nh;
+ /* 1st adjustment depends on extra high remainder limb. */
+ cnd = nh != 0; /* FIXME: cmp-to-int */
+#if OPERATION_sbpi1_div_qr_sec
+ qlp[0] += cnd;
+#endif
+ nh -= mpn_cnd_sub_n (cnd, np, np, dp, dn);
- np -= dn;
-
- /* 1st adjustment depends on extra high remainder limb. */
- h = np[dn];
-#if OPERATION_sbpi1_div_qr_sec
- qlp[0] += h;
-#endif
- h -= mpn_cnd_sub_n (h, np, np, dp, dn);
-
- /* 2nd adjustment depends on remainder/divisor comparision as well as whether
+ /* 2nd adjustment depends on remainder/divisor comparison as well as whether
extra remainder limb was nullified by previous subtract. */
cy = mpn_sub_n (np, np, dp, dn);
- cy = cy == h; /* FIXME: might leak on some archs */
+ cy = cy - nh;
#if OPERATION_sbpi1_div_qr_sec
- qlp[0] += cy;
+ qlp[0] += 1 - cy;
#endif
- mpn_cnd_add_n (1 - cy, np, np, dp, dn);
+ mpn_cnd_add_n (cy, np, np, dp, dn);
+ /* 3rd adjustment depends on remainder/divisor comparison. */
+ cy = mpn_sub_n (np, np, dp, dn);
+#if OPERATION_sbpi1_div_qr_sec
+ qlp[0] += 1 - cy;
+#endif
+ mpn_cnd_add_n (cy, np, np, dp, dn);
+
+#if OPERATION_sbpi1_div_qr_sec
/* Combine quotient halves into final quotient. */
-#if OPERATION_sbpi1_div_qr_sec
- qh = 0;
- if (nn - dn != 0)
- {
- qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2);
- qh += mpn_add_n (qp, qhp, qlp, nn - dn);
- }
+ qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2);
+ qh += mpn_add_n (qp, qhp, qlp, nn - dn);
return qh;
#else
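The new block comment in mpn/generic/sbpi1_div_sec.c above describes how the quotient is built from two sparsely stored half-limb arrays that are only combined after the main loop. A rough C-level sketch of that final combination step (illustrative only; the limb type, names, and helper loops below are stand-ins, not GMP's mpn_lshift/mpn_add_n):

#include <stdint.h>
#include <stddef.h>

#define LIMB_BITS 64            /* stand-in for GMP_NUMB_BITS on a 64-bit limb */
typedef uint64_t limb_t;        /* stand-in for mp_limb_t */

/* Combine the high and low quotient half-limb arrays into the final
   quotient.  qh_half[i] carries weight B^i * 2^(LIMB_BITS/2) and
   ql_half[i] carries weight B^i, so the high array is shifted left by
   half a limb and the two are added in one pass.  Deferring this to a
   single final pass keeps data-dependent carry propagation out of the
   main division loop. */
static limb_t
combine_half_quotients (limb_t *qp, limb_t *qh_half, const limb_t *ql_half,
                        size_t qn)
{
  /* Multi-limb left shift by LIMB_BITS/2 (the role of mpn_lshift). */
  limb_t shifted_out = 0;
  for (size_t i = 0; i < qn; i++)
    {
      limb_t w = qh_half[i];
      qh_half[i] = (w << (LIMB_BITS / 2)) | shifted_out;
      shifted_out = w >> (LIMB_BITS / 2);
    }

  /* Multi-limb addition with carry (the role of mpn_add_n). */
  limb_t carry = 0;
  for (size_t i = 0; i < qn; i++)
    {
      limb_t s = qh_half[i] + ql_half[i];
      limb_t c1 = s < qh_half[i];       /* carry from the limb addition */
      limb_t r = s + carry;
      limb_t c2 = r < s;                /* carry from propagating carry */
      qp[i] = r;
      carry = c1 | c2;
    }

  /* High quotient limb, corresponding to qh in the patch above. */
  return shifted_out + carry;
}

Since this is one fixed-length pass over arrays of public size, the carries it resolves do not reveal which individual quotient half limbs were underestimated during the loop.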
diff -r 45c38e4c1de2 -r 8de68af9fb4b mpn/x86/cnd_aors_n.asm
--- a/mpn/x86/cnd_aors_n.asm Fri Jul 05 14:45:17 2013 +0200
+++ b/mpn/x86/cnd_aors_n.asm Fri Jul 12 12:21:42 2013 +0200
@@ -51,9 +51,6 @@
MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
ASM_START()
TEXT
ALIGN(16)
diff -r 45c38e4c1de2 -r 8de68af9fb4b mpn/x86_64/cnd_aors_n.asm
--- a/mpn/x86_64/cnd_aors_n.asm Fri Jul 05 14:45:17 2013 +0200
+++ b/mpn/x86_64/cnd_aors_n.asm Fri Jul 12 12:21:42 2013 +0200
@@ -20,15 +20,15 @@
include(`../config.m4')
C cycles/limb
-C AMD K8,K9 2.25
+C AMD K8,K9 2
C AMD K10 2
-C AMD bd1 3.55
-C AMD bobcat 2.5
+C AMD bd1 2.32
+C AMD bobcat 3
C Intel P4 13
C Intel core2 2.9
-C Intel NHM 2.9
+C Intel NHM 2.8
C Intel SBR 2.4
-C Intel atom 6.5
+C Intel atom 5.33
C VIA nano 3
C NOTES
@@ -94,19 +94,19 @@
L(b3): mov (vp,n,8), %r12
mov 8(vp,n,8), %r13
mov 16(vp,n,8), %r14
+ and cnd, %r12
mov (up,n,8), %r10
+ and cnd, %r13
mov 8(up,n,8), %rbx
+ and cnd, %r14
mov 16(up,n,8), %rbp
- and cnd, %r12
- and cnd, %r13
- and cnd, %r14
ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
ADCSBB %r14, %rbp
+ mov %rbp, 16(rp,n,8)
sbb R32(%rax), R32(%rax) C save carry
- mov %r10, (rp,n,8)
- mov %rbx, 8(rp,n,8)
- mov %rbp, 16(rp,n,8)
add $3, n
js L(top)
jmp L(end)
@@ -114,14 +114,14 @@
L(b2): mov (vp,n,8), %r12
mov 8(vp,n,8), %r13
mov (up,n,8), %r10
+ and cnd, %r12
mov 8(up,n,8), %rbx
- and cnd, %r12
and cnd, %r13
ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
sbb R32(%rax), R32(%rax) C save carry
- mov %r10, (rp,n,8)
- mov %rbx, 8(rp,n,8)
add $2, n
js L(top)
jmp L(end)
@@ -130,8 +130,8 @@
mov (up,n,8), %r10
and cnd, %r12
ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
sbb R32(%rax), R32(%rax) C save carry
- mov %r10, (rp,n,8)
add $1, n
jns L(end)
@@ -140,24 +140,24 @@
mov 8(vp,n,8), %r13
mov 16(vp,n,8), %r14
mov 24(vp,n,8), %r11
+ and cnd, %r12
mov (up,n,8), %r10
+ and cnd, %r13
mov 8(up,n,8), %rbx
+ and cnd, %r14
mov 16(up,n,8), %rbp
+ and cnd, %r11
mov 24(up,n,8), %r9
- and cnd, %r12
- and cnd, %r13
- and cnd, %r14
- and cnd, %r11
add R32(%rax), R32(%rax) C restore carry
ADCSBB %r12, %r10
+ mov %r10, (rp,n,8)
ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
ADCSBB %r14, %rbp
+ mov %rbp, 16(rp,n,8)
ADCSBB %r11, %r9
+ mov %r9, 24(rp,n,8)
sbb R32(%rax), R32(%rax) C save carry
- mov %r10, (rp,n,8)
- mov %rbx, 8(rp,n,8)
- mov %rbp, 16(rp,n,8)
- mov %r9, 24(rp,n,8)
add $4, n
js L(top)
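The mpn/x86/cnd_aors_n.asm and mpn/x86_64/cnd_aors_n.asm changes above retune mpn_cnd_add_n/mpn_cnd_sub_n, the conditional add/subtract primitives used by the adjustment steps in the C file. At the C level, the idea they implement is roughly the following mask-based, branch-free conditional subtract (an illustrative sketch with assumed names and types, not the library's actual implementation):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;        /* stand-in for mp_limb_t */

/* Branch-free conditional subtract in the spirit of mpn_cnd_sub_n:
   rp[] = up[] - (cnd ? vp[] : 0), returning the borrow.  The condition
   is expanded into an all-zero or all-one mask so the same instruction
   stream runs either way, avoiding a data-dependent branch that could
   leak the condition through timing. */
static limb_t
cnd_sub_n_sketch (limb_t cnd, limb_t *rp, const limb_t *up,
                  const limb_t *vp, size_t n)
{
  limb_t mask = -(limb_t) (cnd != 0);   /* 0 if cnd == 0, all ones otherwise */
  limb_t borrow = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb_t v = vp[i] & mask;          /* masked operand: 0 or vp[i] */
      limb_t d = up[i] - v;
      limb_t b1 = d > up[i];            /* borrow from up[i] - v */
      limb_t r = d - borrow;
      limb_t b2 = r > d;                /* borrow from propagating borrow */
      rp[i] = r;
      borrow = b1 | b2;
    }
  return borrow;
}

The assembly does the same masking with `and cnd, ...`; the scheduling changes in this commit interleave those ANDs with the loads and move each store next to its ADDSUB/ADCSBB, which appears to be where the cycle/limb improvements in the updated table come from.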