[Gmp-commit] /var/hg/gmp: Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.
mercurial at gmplib.org
Sun Jan 17 21:19:29 UTC 2021
details: /var/hg/gmp/rev/c452b2e6f681
changeset: 18198:c452b2e6f681
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Jan 17 22:19:23 2021 +0100
description:
Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.
diffstat:
mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm | 710 +++++++++++++++++++++++++++++++++++
1 files changed, 710 insertions(+), 0 deletions(-)
diffs (truncated from 714 to 300 lines):
diff -r 5e97d3815b09 -r c452b2e6f681 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm Sun Jan 17 22:19:23 2021 +0100
@@ -0,0 +1,710 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                 cycles/limb
+C              mul_1     addmul_1
+C AMD K8,K9     n/a         n/a
+C AMD K10       n/a         n/a
+C AMD bd1       n/a         n/a
+C AMD bd2       n/a         n/a
+C AMD bd3       n/a         n/a
+C AMD bd4        ?           ?
+C AMD zn1        ?           ?
+C AMD zn2        ?           ?
+C AMD zn3        ?           ?
+C AMD bt1       n/a         n/a
+C AMD bt2       n/a         n/a
+C Intel P4      n/a         n/a
+C Intel PNR     n/a         n/a
+C Intel NHM     n/a         n/a
+C Intel SBR     n/a         n/a
+C Intel IBR     n/a         n/a
+C Intel HWL    1.68         n/a
+C Intel BWL    1.51      1.67-1.74
+C Intel SKL    1.52      1.63-1.71
+C Intel atom    n/a         n/a
+C Intel SLM     n/a         n/a
+C VIA nano      n/a         n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * Reduce register use, e.g., by combining n_neg and n_save.
+C * Suppress initial store through up; it's always zero.
+C * Streamline up and dp setup.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl mp_limb_t
+dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
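+dnl  A minimal C sketch of what the function computes (an illustration,
+dnl  assuming the bdiv convention dinv * dp[0] == -1 (mod B), B = 2^64):
+dnl  each quotient limb q is chosen so that adding q*D to U clears U's
+dnl  low limb, and the 2-adic remainder collects in U's top dn limbs.
+dnl
+dnl    mp_limb_t cy = 0;
+dnl    for (mp_size_t i = un - dn; i != 0; i--)
+dnl      {
+dnl        mp_limb_t q = up[0] * dinv;  /* q*dp[0] == -up[0] (mod B) */
+dnl        mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q);  /* U += q*D */
+dnl        mp_limb_t t = up[dn] + cy;   /* fold the saved carry bit  */
+dnl        cy = t < cy;
+dnl        t += hi;                     /* and the addmul carry limb */
+dnl        cy += t < hi;
+dnl        up[dn] = t;
+dnl        up++;                        /* the zeroed limb drops out */
+dnl      }
+dnl    return cy;
+dnl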
+define(`up', `%rdi')
+define(`un', `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`dp', `%r14')
+define(`n_neg', `%rbx')
+define(`q', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
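+
+C Divisors of at most MAX_SPECIAL limbs skip the general loop and are
+C dispatched through L(sma) to size-specific unrolled code; an including
+C file may predefine MAX_SPECIAL to change that cutoff.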
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ lea L(atab)(%rip), %r10
+
+ cmp $MAX_SPECIAL, dn_param
+ jbe L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+
+ sub dn_param, un C outer loop count
+
+ lea -8(,dn_param,8), n_neg
+ neg n_neg
+ mov dn_param, n_save
+ mov R32(dn_param), R32(%rax)
+ shr $3, n_save C loop count
+ and $7, R32(%rax) C clear CF and OF as side-effect
+
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
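+C The lookup above fetches the entry point for dn mod 8 from L(atab):
+C under PIC the table holds 32-bit offsets relative to its own base,
+C hence the movslq load plus the add of %r10, while without PIC it holds
+C plain 64-bit code pointers.  In rough C terms, with hypothetical
+C tables atab_rel (int32_t) and atab_abs (void *) for the two layouts:
+C
+C    jaddr = (char *) atab + atab_rel[dn % 8];    /* PIC     */
+C    jaddr = atab_abs[dn % 8];                    /* non-PIC */
+C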
+ mov (up), q
+ imul dinv, q
+ jmp L(outer)
+
+L(f0): mulx( (dp), w2, w3)
+ lea -1(n), n
+ mulx( 8,(dp), w0, w1)
+ lea -8(dp), dp
+ adcx( w3, w0)
+ adox( (up), w2)
+ lea -8(up), up
+ jmp L(b0x)
+
+L(f3): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -48(up), up
+ lea 16(dp), dp
+ jmp L(b3x)
+
+L(f4): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 24(dp), dp
+ adox( (up), w2)
+ lea -40(up), up
+ adcx( w3, w0)
+ jmp L(b4x)
+
+L(f5): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 32(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -32(up), up
+ jmp L(b5x)
+
+L(f6): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 40(dp), dp
+ adox( (up), w2)
+ lea -24(up), up
+ adcx( w3, w0)
+ jmp L(b6x)
+
+L(f7): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 48(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -16(up), up
+ jmp L(b7x)
+
+L(f1): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ jmp L(b1x)
+
+L(f2): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 8(dp), dp
+ adox( (up), w2)
+ lea 8(up), up
+ adcx( w3, w0)
+ jmp L(b2x)
+
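+C Between outer iterations the carry out of U's top limb has to survive
+C the next inner-loop pass, which clobbers both CF and OF, so L(end)
+C below parks it in bit 0 of %r13: bt reloads it into CF, and setc saves
+C the new value.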
+L(end): adox( (up), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (up)
+ adc %rcx, w1 C relies on rcx = 0
+ mov 8(up,n_neg), q C Compute next quotient early...
+ mulx( dinv, q, %r12) C ...(unused in last iteration)
+ bt $0, R32(%r13)
+ adc w1, 8(up)
+ setc R8(%r13)
+ dec un C clear OF as side-effect
+ jz L(done)
+
+ lea (dp,n_neg), dp C reset dp to D[]'s beginning
+ lea 8(up,n_neg), up C point up to U[]'s current beginning
+L(outer):
+ mov n_save, n
+ test %eax, %eax C clear CF and OF
+ jmp *jaddr
+
+ ALIGN(16)
+L(top): adox( -8,(up), w2)
+ adcx( w3, w0)
+ mov w2, -8(up)
+ jrcxz L(end)
+L(b2x): mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ mov w0, (up)
+L(b1x): adcx( w1, w2)
+ mulx( 16,(dp), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(up), w2)
+ mov w2, 8(up)
+L(b0x): mulx( 24,(dp), w2, w3)
+ lea 64(dp), dp
+ adcx( w1, w2)
+ adox( 16,(up), w0)
+ mov w0, 16(up)
+L(b7x): mulx( -32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+L(b6x): mulx( -24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(up), w0)
+ mov w0, 32(up)
+L(b5x): mulx( -16,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+L(b4x): adox( 48,(up), w0)
+ mulx( -8,(dp), w2, w3)
+ mov w0, 48(up)
+L(b3x): lea 64(up), up
+ adcx( w1, w2)
+ mulx( (dp), w0, w1)
+ jmp L(top)
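+
+C The loop above sustains two independent carry chains: adcx (CF) glues
+C the high limb of each mulx product onto the next product's low limb,
+C while adox (OF) accumulates the glued products into U.  mulx, lea and
+C jrcxz handle the arithmetic and control that must not touch the flags,
+C so both chains stay live across iterations.  A C sketch of the
+C technique (a plain addmul_1, not this exact loop; whether the
+C intrinsics lower to adcx/adox is up to the compiler):
+C
+C    #include <stddef.h>
+C    #include <immintrin.h>  /* _mulx_u64 (BMI2), _addcarryx_u64 (ADX) */
+C
+C    unsigned long long
+C    addmul_1_sketch (unsigned long long *up, const unsigned long long *dp,
+C                     size_t n, unsigned long long q)
+C    {
+C      unsigned char cf = 0, of = 0;
+C      unsigned long long hi, lo, hi_prev = 0;
+C      for (size_t j = 0; j < n; j++)
+C        {
+C          lo = _mulx_u64 (q, dp[j], &hi);            /* flags untouched */
+C          cf = _addcarryx_u64 (cf, lo, hi_prev, &lo);  /* CF chain */
+C          of = _addcarryx_u64 (of, lo, up[j], &up[j]); /* OF chain */
+C          hi_prev = hi;
+C        }
+C      return hi_prev + cf + of;  /* both chains drain into the top */
+C    }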
+
+L(done):mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(sma):
+ifdef(`PIC',
+` movslq 28(%r10,dn_param,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov 56(%r10,dn_param,8), jaddr
+')
+ jmp *jaddr
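+C The 28- and 56-byte biases address the slots right after the eight
+C mod-8 entries used by L(gen): with 4-byte PIC offsets, entry dn lives
+C at 28 + 4*dn, and with 8-byte pointers at 56 + 8*dn, i.e., table slots
+C 8 through 8 + MAX_SPECIAL - 1.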
+
+L(1): mov (dp_param), %r10
+ xor R32(%rax), R32(%rax)
+ mov (up), %rdx
+ dec un
+ mov %rdx, %r9
+L(o1): mulx( dinv, %rdx, %r11) C next quotient
+ lea 8(up), up
+ mulx( %r10, %rcx, %rdx) C 0 1
+ add %r9, %rcx C 0
+ adc %rax, %rdx C 1
+ add (up), %rdx C 1
+ setc R8(%rax) C 2
+ mov %rdx, %r9 C 1
+ dec un
+ jnz L(o1)
+ mov %r9, (up)
+
+ FUNC_EXIT()
+ ret
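+
+C The dn == 1 code above is Hensel division by a single odd limb (%r10
+C holds d = dp[0], %r9 the running low limb r, %rax the saved carry).
+C A minimal C sketch, assuming 64-bit limbs and unsigned __int128:
+C
+C    mp_limb_t r = up[0], cy = 0;
+C    for (mp_size_t i = 1; i < un; i++)
+C      {
+C        mp_limb_t q = r * dinv;  /* q*d == -r (mod B) */
+C        unsigned __int128 t = (unsigned __int128) q * d + r; /* low limb -> 0 */
+C        t = (t >> 64) + up[i] + cy;
+C        r = (mp_limb_t) t;
+C        cy = (mp_limb_t) (t >> 64);
+C      }
+C    up[un-1] = r;  /* the one-limb remainder */
+C    return cy;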