386 optimized bitblit code

Brian Hurt bhurt at spnz.org
Sat Jan 31 14:29:27 CET 2004


Attached is a first cut at 386-optimized code.  This code runs at 1.894
clocks/bit for short blits, or about 4.6% slower than the pure C code (due
primarily to the extra branch I had to put in).  But for long blits it
runs at 0.149 clocks/bit, or about 36% faster than the C-only version.

-- 
"Usenet is like a herd of performing elephants with diarrhea -- massive,
difficult to redirect, awe-inspiring, entertaining, and a source of
mind-boggling amounts of excrement when you least expect it."
                                - Gene Spafford 
Brian
-------------- next part --------------
/* mpn_bitblit -- copy a sequence of bits

Copyright 1991, 1993, 1994, 1996, 2000, 2001, 2002 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include <string.h>
#include "gmp.h"
#include "gmp-impl.h"

/* Move len bits starting at bit src_off from src to dst starting at bit
 * dst_off, so that bit src_off+i from src becomes bit
 * dst_off+i in dst, for all i, 0 <= i < len.
 *
 * When source and destination overlap, the routine behaves as if
 * the bits were first copied to a separate (non-overlapping) buffer, and
 * then copied from that buffer into the destination.  In other words,
 * this routine acts like memmove().
 *
 * Note that only bits dst_off to dst_off + len - 1 inclusive are modified;
 * bits dst_off - 1 and dst_off + len are left unmodified.
 *
 * Bits are numbered from low-order to high order, starting at bit 0.
 * So bit 0 is the lowest ordered bit in word 0 of source, bit number
 * GMP_NUMB_BITS-1 is the highest ordered bit in word 0, bit number
 * GMP_NUMB_BITS is the lowest ordered bit in word 1, etc.  The byte order
 * in which words are stored is not considered.  Thus, while this routine
 * satisfies all practical needs of GMP, it is not a general-purpose
 * bitblit routine.
 *
 * There is a guarantee that the routine will never read beyond the
 * word in src that bit (src_off + len - 1) is in.  Ever.  This means that
 * the routine needs to special-case the last word.
 *
 * This source code assumes a binary system- i.e. that integers are some
 * number of bits.  BCD, Base-10, trinary, and other non-binary systems
 * will need special implementations.  Note that it does not assume that
 * the number of bits is a power of two (16, 32, or 64)- 24 bit, 60 bit,
 * 17 bit, etc. systems should work just fine.
 */
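
/* A worked example of the numbering, assuming GMP_NUMB_BITS == 32 (the
 * 32-bit limb is an assumption for the example, not a requirement of the
 * routine): mpn_bitblit (d, 30, s, 4, 8) copies bits 4..11 of s[0] into
 * bits 30..31 of d[0] and bits 0..5 of d[1].  Bits 0..29 of d[0] and
 * bits 6..31 of d[1] are left untouched, and only s[0] is read.
 */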

#if 0

/* The reference implementation of the routine.  All implementations are
 * guaranteed to behave the same, just faster.  Specifically, routines
 * are not allowed to read words this routine doesn't read, and not
 * allowed to write words this routine doesn't write.
 */

#define LIMB_OF(x) ((x)/GMP_NUMB_BITS)
#define BIT_OF(x) ((x) % GMP_NUMB_BITS)
#define BIT(x) (((mp_limb_t) 1) << BIT_OF(x))

void
mpn_bitblit (mp_ptr dst, size_t dst_off, mp_srcptr src, size_t src_off,
             size_t len)
{
    size_t i;

    if ((dst < src) || ((dst == src) && (dst_off <= src_off)) ||
        (dst > (src + LIMB_OF(src_off + len - 1)))) {
        /* copy forward */
        for (i = 0; i < len; ++i) {
            if ((src[LIMB_OF(src_off + i)] & BIT(src_off + i)) != 0)
            {
                /* Set bit */
                dst[LIMB_OF(dst_off + i)] |= BIT(dst_off + i);
            }
            else {
                /* Clear bit */
                dst[LIMB_OF(dst_off + i)] &= ~BIT(dst_off + i);
            }
        }
    }
    else
    {
        /* copy backwards */
        for (i = len; i > 0 ; --i) {
            if ((src[LIMB_OF(src_off + i - 1)] & BIT(src_off + i - 1)) != 0)
            {
                /* Set bit */
                dst[LIMB_OF(dst_off + i - 1)] |= BIT(dst_off + i - 1);
            }
            else {
                /* Clear bit */
                dst[LIMB_OF(dst_off + i - 1)] &= ~BIT(dst_off + i - 1);
            }
        }
    }
}

#endif /* 0 */

/* A word on shifts.  We cannot assume how shifts outside the range of 0
 * to GMP_NUMB_BITS-1 work.  Specifically, we cannot assume that
 * (x << GMP_NUMB_BITS) == 0 for x != 0 is true, as convenient as that might
 * be.  Many systems reduce the shift distance modulo GMP_NUMB_BITS.  So
 * (x << GMP_NUMB_BITS) becomes (x << (GMP_NUMB_BITS % GMP_NUMB_BITS))
 * becomes (x << 0) becomes x.  Oops.  Likewise, (x << -1) does not
 * necessarily equal either 0 or (x >> 1) or any sensible result.  So we
 * have to be very careful with our shifts.
 *
 * Note that we do assume that (x << 0) == (x >> 0) == x for all x.
 */
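
/* For instance, the x86 masks 32-bit shift counts to their low 5 bits in
 * hardware, so with 32-bit limbs ((mp_limb_t) 1 << 32) can come out as 1
 * rather than 0.  And since the C expression is undefined, the compiler
 * is free to produce anything at all:
 *
 *     mp_limb_t x = 1;
 *     x << GMP_NUMB_BITS;        -- undefined: may be 0, x, or a trap
 *     x << (GMP_NUMB_BITS - 1);  -- well defined: the high bit of x
 */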

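/* Note: these tables (and the inline assembly below) assume
 * GMP_NUMB_BITS == 32, which is fine for this 386-specific version; a
 * generic build would need tables sized to the actual limb width.
 */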
static const mp_limb_t bitmask_lo_table[32] = {
    0x00000001, 0x00000003, 0x00000007, 0x0000000F,
    0x0000001F, 0x0000003F, 0x0000007F, 0x000000FF,
    0x000001FF, 0x000003FF, 0x000007FF, 0x00000FFF,
    0x00001FFF, 0x00003FFF, 0x00007FFF, 0x0000FFFF,
    0x0001FFFF, 0x0003FFFF, 0x0007FFFF, 0x000FFFFF,
    0x001FFFFF, 0x003FFFFF, 0x007FFFFF, 0x00FFFFFF,
    0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF, 0x0FFFFFFF,
    0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
};

static const mp_limb_t bitmask_hi_table[32] = {
    0xFFFFFFFFul, 0xFFFFFFFEul, 0xFFFFFFFCul, 0xFFFFFFF8ul,
    0xFFFFFFF0ul, 0xFFFFFFE0ul, 0xFFFFFFC0ul, 0xFFFFFF80ul,
    0xFFFFFF00ul, 0xFFFFFE00ul, 0xFFFFFC00ul, 0xFFFFF800ul,
    0xFFFFF000ul, 0xFFFFE000ul, 0xFFFFC000ul, 0xFFFF8000ul,
    0xFFFF0000ul, 0xFFFE0000ul, 0xFFFC0000ul, 0xFFF80000ul,
    0xFFF00000ul, 0xFFE00000ul, 0xFFC00000ul, 0xFF800000ul,
    0xFF000000ul, 0xFE000000ul, 0xFC000000ul, 0xF8000000ul,
    0xF0000000ul, 0xE0000000ul, 0xC0000000ul, 0x80000000ul
};

#define BITMASK_LO(x) (bitmask_lo_table[(x)])
#define BITMASK_HI(x) (bitmask_hi_table[(x)])
#define BITMASK(lo, hi) (BITMASK_LO(hi) & BITMASK_HI(lo))

/* The BITMASK*_ASSERT()s are asserts to make sure our table indices (and
 * thus the implied shift distances) stay in bounds.  I'm paranoid.
 */
#define BITMASK_LO_ASSERT(x) do { \
    ASSERT((x) < GMP_NUMB_BITS); \
} while (0)
#define BITMASK_HI_ASSERT(x) do { \
    ASSERT ((x) < GMP_NUMB_BITS); \
} while (0)
#define BITMASK_ASSERT(lo,hi) do { \
    ASSERT((lo) <= (hi)); \
    ASSERT((hi) < GMP_NUMB_BITS); \
} while (0)
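
/* Example values, assuming GMP_NUMB_BITS == 32:
 *
 *     BITMASK_LO(2)  == 0x00000007   (bits 0..2 set)
 *     BITMASK_HI(29) == 0xE0000000   (bits 29..31 set)
 *     BITMASK(3, 5)  == 0x00000038   (bits 3..5 set)
 */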

/* Note that we use div (/) and mod (%) instead of shifts and ands here.
 * Every compiler I have ever looked at, on systems where ints are a
 * power of two bits, turns the divs and mods into shifts and ands when
 * any optimization is turned on.  So there is no performance advantage
 * to doing an optimization the compiler is perfectly capable of.  But using
 * divs and mods allows 24-bit computers (Harris minis, others),
 * 60-bit computers (older Crays), etc. to work correctly out of the box.
 */

/* What limb a particular bit is in */
#define LIMB_OF(bitno) ((bitno) / GMP_NUMB_BITS)

/* What bit in the limb a particular bit is */
#define BIT_OF(bitno) ((bitno) % GMP_NUMB_BITS)
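
/* For example, with GMP_NUMB_BITS == 32, bit number 37 is bit
 * BIT_OF(37) == 5 of limb LIMB_OF(37) == 1; with GMP_NUMB_BITS == 24 the
 * same bit number is bit 13 of limb 1.
 */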

/* The most likely place to optimize this routine is replacing our shift
 * operations with inline assembly.  So we use a macro to make this easier.
 * DBL_SHIFT shifts a word to the right by a given distance, shifting in
 * the new high bits from another word.
 */
#if (1)
#define DBL_SHIFT(hibits, lobits, dist) (((lobits) >> (dist)) | ((hibits) << (GMP_NUMB_BITS - (dist))))
#else
/* An example assembler optimization, for the x86 using GCC/GAS */
static __inline__ mp_limb_t DBL_SHIFT(
    mp_limb_t hibits,
    mp_limb_t lobits,
    size_t dist)
{
    mp_limb_t res;
    ASSERT (dist < GMP_NUMB_BITS);
    __asm__ __volatile__ ("shrdl	%b2,%1,%0" : "=rm" (res) : "r" (hibits), "c" (dist), "0" (lobits));
    return res;
}
#endif
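
/* Whichever version is used, DBL_SHIFT requires 0 < dist < GMP_NUMB_BITS:
 * dist == 0 would shift by GMP_NUMB_BITS in the C macro, which is exactly
 * the undefined case described above, so every caller below guards
 * against it.  A sample value, assuming GMP_NUMB_BITS == 32:
 *
 *     DBL_SHIFT(0x000000AB, 0xCD000000, 24) == 0x0000ABCD
 *
 * The low 8 bits of the result are the top 8 bits of lobits, and the
 * remaining bits come from the bottom of hibits.
 */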


void
mpn_bitblit (mp_ptr dst, size_t dst_off, mp_srcptr src, size_t src_off,
             size_t len)
{
    mp_limb_t temp;
    size_t idx;
    size_t limit;

    /* Adjust the dst and src pointers if we're not starting with word 0. */
    if (dst_off >= GMP_NUMB_BITS) {
        dst += LIMB_OF(dst_off);
        dst_off = BIT_OF(dst_off);
    }

    if (src_off >= GMP_NUMB_BITS) {
        src += LIMB_OF(src_off);
        src_off = BIT_OF(src_off);
    }

    if (len <= GMP_NUMB_BITS) {
        /* This is an uncommon case- we're short enough that we are likely
         * to fit into a small number of words.  By protecting this code
         * with a simple test, we can cheaply skip a lot of checks for
         * corner cases.
         */

        /* Early exit if no work */
        if (len == 0) {
            return;
        }

        if ((src_off + len) <= GMP_NUMB_BITS) {
            /* We only read the first word of src */
            BITMASK_LO_ASSERT(len-1);
            temp = (src[0] >> src_off) & BITMASK_LO(len-1);

            if ((dst_off + len) <= GMP_NUMB_BITS) {
                /* We only write the first word of dst */
                BITMASK_ASSERT(dst_off, dst_off + len - 1);
                dst[0] = (dst[0] & ~BITMASK(dst_off, dst_off + len - 1)) |
                         (temp << dst_off);
            }
            else {
                /* We write two words of dst */
                /* Note that len <= GMP_NUMB_BITS.  Otherwise we wouldn't be
                 * here.  The only way to get into this branch, then, is
                 * for dst_off + len > GMP_NUMB_BITS, which means dst_off > 0.
                 * So doing BITMASK_LO(dst_off-1) is safe.
                 */
                BITMASK_LO_ASSERT(dst_off - 1);
                dst[0] = (dst[0] & BITMASK_LO(dst_off - 1)) |
                         (temp << dst_off);
                BITMASK_HI_ASSERT(dst_off + len - GMP_NUMB_BITS);
                dst[1] = (dst[1] & BITMASK_HI(dst_off + len - GMP_NUMB_BITS)) |
                         (temp >> (GMP_NUMB_BITS - dst_off));
            }

            return;

        }
        else if ((dst_off + len) <= GMP_NUMB_BITS) {
            /* We only write the first word of dst */
            /* Note that we need to read exactly 2 words of src.  If we only
             * needed to read one, we'd have caught that above.  We have more
             * than GMP_NUMB_BITS worth of bits between the two words, so we
             * don't need more than 2 words of src.
             */

            temp = DBL_SHIFT(src[1], src[0], src_off);

            BITMASK_LO_ASSERT(len-1);
            temp &= BITMASK_LO(len-1);

            BITMASK_ASSERT(dst_off, dst_off + len - 1);
            dst[0] = (dst[0] & ~BITMASK(dst_off, dst_off + len - 1)) |
                     (temp << dst_off);

            return;
        }

    }
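
    /* To make the short path above concrete, assume GMP_NUMB_BITS == 32
     * and take src_off = 30, dst_off = 4, len = 8: the source bits
     * straddle src[0] and src[1], so DBL_SHIFT gathers them into temp,
     * BITMASK(4, 11) selects bits 4..11 of dst[0] to be replaced, and
     * (temp << 4) drops the copied bits into place.  Nothing beyond
     * src[1] or dst[0] is touched.
     */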


#if defined(TEST_BACKWARDS)
    if ((dst <= src) && (src < (dst + LIMB_OF(dst_off + len - 1)))) {
        ASSERT (0);
    }
#else /* TEST_BACKWARDS */
    if ((dst < src) || ((dst == src) && (dst_off <= src_off)) ||
        (dst > (src + LIMB_OF(src_off + len - 1)))) {

        /* We can copy forward */

        /* Move just enough bits so that dst_off == 0 */
        if (dst_off > 0) {
            if (src_off != 0) {
                /* We need to read from two words */
                temp = DBL_SHIFT(src[1], src[0], src_off);
            } else {
                /* We only need to read from one word */
                temp = src[0];
            }

            /* Leave the original bits in dst untouched */
            BITMASK_LO_ASSERT(dst_off - 1);
            dst[0] = (dst[0] & BITMASK_LO(dst_off - 1)) |
                     (temp << dst_off);

            dst += 1;
            src_off += GMP_NUMB_BITS - dst_off;
            len -= GMP_NUMB_BITS - dst_off;
            if (src_off >= GMP_NUMB_BITS) {
                src_off -= GMP_NUMB_BITS;
                src += 1;
            }
        }

        /* Move the bulk of the words- this is where most of the work
         * happens.
         */
        limit = LIMB_OF(len);
        len = BIT_OF(len);

        if (src_off == 0) {
            /* If we don't need to shift, use the faster memmove.  Note
             * that we don't call memmove with size = 0.  Note that we
             * can't use memcpy, which might be faster, as the arrays may
             * still overlap, and memcpy has undefined semantics in that
             * case.  It'd *probably* work, but not definitely.  So memmove
             * it is.
             */
            if (limit > 0) {
               memmove((void *) dst, (const void *) src,
                       limit * sizeof(mp_limb_t));
            }
            idx = limit;
        }
        else {
            /* The core loop.  Note that this loop may iterate 0 times. */

            if (limit < 4) {
                /* Nothing I do to try to optimize this loop makes it any
                 * faster- generally it just slows it down.
                 */
                for (idx = 0; idx < limit; ++idx) {
                    dst[idx] = DBL_SHIFT(src[idx+1], src[idx], src_off);
                }
            } else {
                /* Handle the leftover (limit % 4) words with the plain
                 * loop, then do the bulk four words at a time in assembly.
                 */
                for (idx = 0; idx < (limit % 4); ++idx) {
                    dst[idx] = DBL_SHIFT(src[idx+1], src[idx], src_off);
                }
                idx = limit;
                {
                    /* The loop modifies these four registers, so they must
                     * be read-write ("+") operands; a register the asm
                     * modifies must not be listed as input-only.
                     */
                    size_t cnt = limit / 4;
                    mp_limb_t low = src[limit % 4];
                    mp_srcptr sp = src + 1 + (limit % 4);
                    mp_ptr dp = dst + (limit % 4);
                    __asm__ __volatile__ (
                        "0:\n\t"
                        "movl	(%%esi), %%eax\n\t"
                        "shrdl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, (%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	4(%%esi), %%eax\n\t"
                        "shrdl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, 4(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	8(%%esi), %%eax\n\t"
                        "shrdl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, 8(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	12(%%esi), %%eax\n\t"
                        "shrdl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, 12(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "addl	$16, %%esi\n\t"
                        "addl	$16, %%edi\n\t"
                        "decl	%%ebx\n\t"
                        "jnz	0b\n\t"
                        : "+b" (cnt), "+d" (low), "+S" (sp), "+D" (dp)
                        : "c" (src_off)
                        : "cc", "memory", "eax");
                }
            }
        }

        /* At this point idx == limit on every path above. */
        if (len > 0) {

            /* Deal with the last word special */
            BITMASK_LO_ASSERT(len-1);
            temp = BITMASK_LO(len-1);
            if ((src_off + len - 1) >= GMP_NUMB_BITS) {
                dst[idx] = (dst[idx] & ~temp) |
                             (DBL_SHIFT(src[idx+1], src[idx], src_off) & temp);
            } else {
                dst[idx] = (dst[idx] & ~temp) | ((src[idx] >> src_off) & temp);
            }
        }
    }
#endif /* TEST_BACKWARDS */

    else
    {
        /* We must copy backwards */

        /* First, we adjust things so the working dst_off is zero.  We keep
         * in dst_off the number of bits we skipped over, so we can undo
         * the adjustment later.
         */
        if (dst_off > 0) {
            dst_off = GMP_NUMB_BITS - dst_off;
            dst += 1;

            /* Now pretend we've consumed dst_off bits */
            len -= dst_off;
            src_off += dst_off;
            /* Check to see if src_off scrolled into the next word.  If so,
             * adjust.
             */
            if (src_off >= GMP_NUMB_BITS) {
                src += 1;
                src_off -= GMP_NUMB_BITS;
            }
        }

        /* Limit becomes the number of dst words we need to move in bulk,
         * while len becomes the remainder number of bits we need to move in
         * the last word.  Remember that the number of bits we need to move
         * into the first word is still stored in dst_off.
         */
        limit = LIMB_OF(len);
        len = BIT_OF(len);

        /* Deal with the last word */
        idx = limit;
        if (len > 0) {

            /* Deal with the last word special */
            BITMASK_LO_ASSERT(len-1);
            temp = BITMASK_LO(len-1);
            if ((src_off + len - 1) >= GMP_NUMB_BITS) {
                dst[idx] = (dst[idx] & ~temp) |
                             (DBL_SHIFT(src[idx+1], src[idx], src_off) & temp);
            } else {
                dst[idx] = (dst[idx] & ~temp) | ((src[idx] >> src_off) & temp);
            }
        }

        /* Move the bulk of the words- this is where most of the work
         * happens.
         */
        if (src_off == 0) {
            /* If we don't need to shift, use the faster memmove.  Note
             * that we don't call memmove with size = 0.  The arrays will
             * overlap, so we need to use memmove.
             */
            if (limit > 0) {
               memmove((void *) dst, (const void *) src,
                       limit * sizeof(mp_limb_t));
            }
        }
        else {
            /* The core loop.  Note that this loop may iterate 0 times. */
            /* We count down with a while loop because the copy has to
             * run from the high words to the low words.
             */
            if (limit < 4) {
                /* Nothing I do to try to optimize this loop makes it any
                 * faster- generally it just slows it down.
                 */
                idx = limit;
                while (idx > 0) {
                    idx -= 1;
                    dst[idx] = DBL_SHIFT(src[idx+1], src[idx], src_off);
                }
            } else {
                /* Handle the leftover (limit % 4) words with the plain
                 * loop, then do the bulk four words at a time in assembly.
                 */
                idx = limit;
                while ((limit % 4) != 0) {
                    limit -= 1;
                    dst[limit] = DBL_SHIFT(src[limit+1], src[limit], src_off);
                }
                {
                    /* As in the forward loop, every register the asm
                     * modifies is a read-write ("+") operand.
                     */
                    size_t cnt = limit / 4;
                    mp_limb_t high = src[limit];
                    mp_srcptr sp = src + limit - 1;
                    mp_ptr dp = dst + limit - 1;
                    __asm__ __volatile__ (
                        "0:\n\t"
                        "movl	(%%esi), %%eax\n\t"
                        "shldl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, (%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	-4(%%esi), %%eax\n\t"
                        "shldl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, -4(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	-8(%%esi), %%eax\n\t"
                        "shldl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, -8(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "movl	-12(%%esi), %%eax\n\t"
                        "shldl	%%cl,%%eax,%%edx\n\t"
                        "movl	%%edx, -12(%%edi)\n\t"
                        "movl	%%eax, %%edx\n\t"
                        "subl	$16,%%esi\n\t"
                        "subl	$16,%%edi\n\t"
                        "decl	%%ebx\n\t"
                        "jnz	0b\n\t"
                        : "+b" (cnt), "+d" (high), "+S" (sp), "+D" (dp)
                        : "c" (GMP_NUMB_BITS - src_off)
                        : "cc", "memory", "eax");
                }
            }
        }

        /* Unadjust dst_off so we can deal with the first word properly */
        if (dst_off > 0) {
            /* Note that the only way src_off can be less than dst_off is
             * if it scrolled.  If it scrolled, we need to unadjust the src
             * pointer as well.
             */
            if (src_off >= dst_off) {
                src_off -= dst_off;
            } else {
                src -= 1;
                src_off += GMP_NUMB_BITS - dst_off;
            }

            /* The length, the number of bits we need to copy, is just
             * dst_off.
             */
            len = dst_off;
            dst_off = GMP_NUMB_BITS - dst_off;
            dst -= 1;

            if (src_off != 0) {
                /* We need to read from two words */
                temp = DBL_SHIFT(src[1], src[0], src_off);
            } else {
                /* We only need to read from one word */
                temp = src[0];
            }

            /* Leave the original bits in dst untouched */
            BITMASK_LO_ASSERT(dst_off - 1);
            dst[0] = (dst[0] & BITMASK_LO(dst_off - 1)) |
                     (temp << dst_off);

        }

    }

}
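
/* A minimal self-check sketch, not part of the library proper: it compares
 * mpn_bitblit against the bit-by-bit reference semantics for a handful of
 * offsets.  It assumes GMP_NUMB_BITS == 32, and both the guard macro
 * BITBLIT_DEMO and the harness itself are illustrative additions, not part
 * of the patch above.
 */
#ifdef BITBLIT_DEMO
#include <stdio.h>

static int
check_one (size_t dst_off, size_t src_off, size_t len)
{
    mp_limb_t src[4]  = { 0xDEADBEEF, 0x01234567, 0x89ABCDEF, 0x0F0F0F0F };
    mp_limb_t got[4]  = { 0, 0, 0, 0 };
    mp_limb_t want[4] = { 0, 0, 0, 0 };
    size_t i;

    /* Reference semantics: move bit src_off+i to bit dst_off+i, one bit
     * at a time.
     */
    for (i = 0; i < len; i++) {
        if (src[(src_off + i) / GMP_NUMB_BITS] &
            ((mp_limb_t) 1 << ((src_off + i) % GMP_NUMB_BITS)))
            want[(dst_off + i) / GMP_NUMB_BITS] |=
                (mp_limb_t) 1 << ((dst_off + i) % GMP_NUMB_BITS);
    }

    mpn_bitblit (got, dst_off, src, src_off, len);

    for (i = 0; i < 4; i++) {
        if (got[i] != want[i]) {
            printf ("FAIL: dst_off=%u src_off=%u len=%u\n",
                    (unsigned) dst_off, (unsigned) src_off, (unsigned) len);
            return 1;
        }
    }
    return 0;
}

int
main (void)
{
    int fails = 0;
    fails += check_one (0, 0, 32);    /* aligned, exactly one word   */
    fails += check_one (4, 30, 8);    /* source straddles two words  */
    fails += check_one (30, 4, 8);    /* dest straddles two words    */
    fails += check_one (7, 13, 90);   /* long and unaligned          */
    printf (fails ? "FAILED\n" : "ok\n");
    return fails;
}
#endif /* BITBLIT_DEMO */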


