glide/swlibs/texus2/lib/codec.c

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include "texusint.h"
#include "sst2fxt1.h"

// Square of x.  The argument is expanded twice, so it must be free of side effects.
#define SQUARED(x)  ((x)*(x))
// Absolute value of x.  The argument is expanded twice, so it must be free of side effects.
#define ABS(x)      (((x) < 0) ? -(x) : (x))

// Block coordinates for debug output; in this part of the file they are only
// referenced by disabled (#if 0 / commented-out) printf statements.
int     globalX, globalY;
// Debug colour coding used by decodeColors(): -1 disables it, 4 tags every
// block, any other value tags only blocks whose compression mode equals it.
int     a_color_cc  = -1; // force color coding
// NOTE(review): not referenced in this part of the file; presumably forces a
// particular compression mode -- confirm against the rest of the file.
int     a_force_cc  = -1; // force color mode
// NOTE(review): not referenced in this part of the file.
int     a_tolerance = 0;
// When non-zero, bestColorAlpha() compares alpha-premultiplied RGB instead of
// plain RGBA distance.
int     a_lerp = 0; // force interpolation in alpha mode


// WARNING: from here on every 'static' keyword expands to nothing, so all
// functions and file-scope objects declared 'static' below get external
// linkage, and any 'static' local would become an automatic variable.
#define static

/*
 * Return the index of the codebook entry closest to the RGB color 'a',
 * measured by squared Euclidean distance over the three components.
 *
 *   a         color to match (3 floats)
 *   codebook  candidate colors, 'codesize' rows of 3 floats
 *   codesize  number of valid rows in 'codebook'
 *
 * Ties are resolved in favour of the lowest index.  If 'codesize' is <= 0
 * nothing is read from 'codebook' and 0 is returned.
 *
 * The search is done in a single pass without a scratch array, so there is
 * no upper limit on 'codesize'.
 */
static int
bestColor(
    const float  *a,
    const float  codebook[][3],
    int          codesize)
{
    int     i;
    int     bestindex = 0;
    float   bestdist  = 0.0f;

    for (i=0; i < codesize; i++) {
        const float d0 = a[0] - codebook[i][0];
        const float d1 = a[1] - codebook[i][1];
        const float d2 = a[2] - codebook[i][2];
        const float dist = d0*d0 + d1*d1 + d2*d2;

        /* entry 0 always seeds the search; later entries must be strictly closer */
        if ((i == 0) || (dist < bestdist)) {
            bestdist  = dist;
            bestindex = i;
        }
    }
    return bestindex;
}

/*
 * Like bestColor(), but also reports how far away the winning entry is.
 *
 *   a         color to match (3 floats)
 *   codebook  candidate colors, 'codesize' rows of 3 floats
 *   codesize  number of valid rows in 'codebook'
 *   error     RETURN: squared Euclidean distance from 'a' to the chosen entry
 *
 * Returns the index of the closest entry; ties go to the lowest index.
 * If 'codesize' is <= 0 nothing is read from 'codebook', 0 is returned and
 * *error is set to 0.0f.
 *
 * The search is done in a single pass without a scratch array, so there is
 * no upper limit on 'codesize'.
 */
static int
bestColorError(
    const float  *a,
    const float  codebook[][3],
    int          codesize,
    float        *error)     // RETURN
{
    int     i;
    int     bestindex = 0;
    float   bestdist  = 0.0f;

    for (i=0; i < codesize; i++) {
        const float d0 = a[0] - codebook[i][0];
        const float d1 = a[1] - codebook[i][1];
        const float d2 = a[2] - codebook[i][2];
        const float dist = d0*d0 + d1*d1 + d2*d2;

        /* entry 0 always seeds the search; later entries must be strictly closer */
        if ((i == 0) || (dist < bestdist)) {
            bestdist  = dist;
            bestindex = i;
        }
    }
    *error = bestdist;
    return bestindex;
}

// Usable only for the interpolation compression modes.
// XXX Susceptible to rounding errors?
//
// Maps color 'a' to a palette index by evaluating dot(a, iv) + b and
// truncating towards zero, where 'iv' and 'b' are the vector and offset
// produced by makeInterpVector().  The result is clamped to [0, codesize-1].
//
// The clamp is performed on the float value *before* converting to int:
// converting a NaN, an infinity or a value outside the range of int is
// undefined behaviour in C.  A NaN maps to index 0.
static int
bestColorRGBInterp(
    const float  a[3],
    const float  iv[3],
    const float  b,
    const int    codesize)
{
    const float t = (a[0]*iv[0] + a[1]*iv[1] + a[2]*iv[2]) + b;

    if ( !(t >= 1.0f) )             // t < 1 truncates to <= 0; also catches NaN
        return 0;
    if ( t >= (float)codesize )
        return codesize-1;
    return (int)t;                  // here 1 <= t < codesize, safe to convert
}

/*
 * Return the index of the RGBA codebook entry closest to the color (c, a).
 *
 *   c         RGB components of the color to match (3 floats)
 *   a         alpha component of the color to match
 *   codebook  candidate colors, 'codesize' rows of {R, G, B, A}
 *   codesize  number of valid rows in 'codebook'
 *   lerp      when zero, a color whose R, G, B and A are all exactly 0.0f is
 *             mapped straight to index 3 without searching
 *
 * The distance metric is selected by the global 'a_lerp' (not by the 'lerp'
 * parameter): non-zero compares alpha-premultiplied RGB, zero compares plain
 * squared RGBA distance.
 * NOTE(review): confirm that using the global here rather than 'lerp' is
 * intentional.
 *
 * Returns -1 if no entry has a distance below 1.0e30 (in particular when
 * 'codesize' is <= 0).
 */
static int
bestColorAlpha(
    const float  *c,
    const float   a,
    const float  codebook[][4],
    const int    codesize,
    const FxU32  lerp)
{
    int     i;
    int     bestindex = -1;
    float   bestdist  = 1.0e30F;
    float   dist;

    if (!lerp && ( c[0] == 0.0f ) && ( c[1] == 0.0f ) &&  ( c[2] == 0.0f ) && ( a == 0.0f ))
        return 3;

    for (i=0; i < codesize; i++) {
        if ( a_lerp ) {
            dist = SQUARED(c[0]*a - codebook[i][0]*codebook[i][3]) +
                   SQUARED(c[1]*a - codebook[i][1]*codebook[i][3]) +
                   SQUARED(c[2]*a - codebook[i][2]*codebook[i][3]);
        } else {
            dist = SQUARED(c[0] - codebook[i][0]) +
                   SQUARED(c[1] - codebook[i][1]) +
                   SQUARED(c[2] - codebook[i][2]) +
                   SQUARED(a - codebook[i][3]);
        }
        if (dist < bestdist) {
            bestdist  = dist;
            bestindex = i;
        }
    }

    return bestindex;
}

/*
 * Build an 'nlevels'-entry RGB palette by integer linear interpolation
 * between the packed colors 'lo' (entry 0) and 'hi' (entry nlevels-1).
 * 'nlevels' must be 3, 4 or 7 (asserted).
 *
 * Alpha is interpolated as well, but only so that it can be range-checked by
 * the assert; it is not stored.
 */
// XXX The output fpal's should be quantized to 555, except that the newer, faster,
//     bestColorRGBInterp() cannot easily use them.
static void
makePalette( FxU32 lo, FxU32 hi, int nlevels,
             float fpal[][3])          // output, range [0.5f,255.5f], quantized to Z+0.5f
{
    int a_lo, r_lo, g_lo, b_lo;         // channels of 'lo'
    int a_hi, r_hi, g_hi, b_hi;         // channels of 'hi'
    int a, r, g, b;                     // interpolated channels of the current entry
    int last, level;

    assert((nlevels == 7) || (nlevels == 4) || (nlevels == 3));

    a_lo = ALF(lo);   a_hi = ALF(hi);
    r_lo = RED(lo);   r_hi = RED(hi);
    g_lo = GRN(lo);   g_hi = GRN(hi);
    b_lo = BLU(lo);   b_hi = BLU(hi);

    last = nlevels - 1;

    for (level = 0; level < nlevels; level++) {
        a = a_lo + ((a_hi - a_lo) * level) / last;
        r = r_lo + ((r_hi - r_lo) * level) / last;
        g = g_lo + ((g_hi - g_lo) * level) / last;
        b = b_lo + ((b_hi - b_lo) * level) / last;

        /* make sure all values are within 0..255 */
        assert( ((a & ~0xff) == 0) && ((r & ~0xff) == 0) &&
                ((g & ~0xff) == 0) && ((b & ~0xff) == 0) );

        /* shift by one half: map to [0.5f,255.5f] */
        fpal[level][0] = (float)r + 0.5f;
        fpal[level][1] = (float)g + 0.5f;
        fpal[level][2] = (float)b + 0.5f;
    }
}

// Returns a vector 'iv' that when dotted with a color and added to an offset 'b',
// finds the index of the Cartesian-nearest color in the (linear) palette.
//
// Imagine the set of ncolors-1 planes in color space, each normal to the line through
// the color palette, that partition color space into slab-shaped sets of points, each slab
// belonging to a particular palette entry.  This procedure implements the first phase
// of that mapping by reducing the color palette to a vector 'iv' and offset 'b' for
// later use by bestColorRGBInterp().
//
// Only p[0] and p[ncolors-1] are read.  If those two endpoints coincide the palette
// is degenerate: 'iv' is set to the zero vector and 'b' to 0.5f, so that every color
// maps to index 0 instead of producing Infinity/NaN.
static void makeInterpVector( float p[8][3],  // 8 is an upper bound by the design of FXT1
                              int ncolors,
                              float iv[3],    // RETURN
                              float *b)       // RETURN
{
    float d2 = 0.0f;
    float rd2;
    int i;

    for ( i=0; i<3; i++) {
        iv[i] = p[ncolors-1][i] - p[0][i];  // vector between extrema of palette; may be zero
        d2 += iv[i]*iv[i];                  // accumulate square of Cartesian distance
    }

    if (d2 == 0.0f) {
        // Zero-length palette: dividing by d2 would give Infinity, and
        // 0 * Infinity would then poison iv[] and *b with NaN.
        iv[0] = iv[1] = iv[2] = 0.0f;
        *b = 0.5f;
        return;
    }

    rd2 = (float)(ncolors-1) / d2;
    *b = 0.0f;
    for ( i=0; i<3; i++) {
        *b -= iv[i]*p[0][i];
        iv[i] *= rd2;
    }
    *b = *b * rd2 + 0.5f;                   // +0.5f so that truncation rounds to nearest
}

/*
 * Same interpolation as makePalette(), but each palette entry has four
 * components: {R, G, B, A}, with alpha stored in fpal[i][3].
 * 'nlevels' must be 3, 4 or 7 (asserted).
 */
static void
makePaletteAlpha( FxU32 lo, FxU32 hi, int nlevels,
                  float fpal[][4])          // output, range [0.5f,255.5f], quantized to Z+0.5f
{
    int a_lo, r_lo, g_lo, b_lo;         // channels of 'lo'
    int a_hi, r_hi, g_hi, b_hi;         // channels of 'hi'
    int a, r, g, b;                     // interpolated channels of the current entry
    int last, level;

    assert((nlevels == 7) || (nlevels == 4) || (nlevels == 3));

    a_lo = ALF(lo);   a_hi = ALF(hi);
    r_lo = RED(lo);   r_hi = RED(hi);
    g_lo = GRN(lo);   g_hi = GRN(hi);
    b_lo = BLU(lo);   b_hi = BLU(hi);

    last = nlevels - 1;

    for (level = 0; level < nlevels; level++) {
        a = a_lo + ((a_hi - a_lo) * level) / last;
        r = r_lo + ((r_hi - r_lo) * level) / last;
        g = g_lo + ((g_hi - g_lo) * level) / last;
        b = b_lo + ((b_hi - b_lo) * level) / last;

        /* make sure all values are within 0..255 */
        assert( ((a & ~0xff) == 0) && ((r & ~0xff) == 0) &&
                ((g & ~0xff) == 0) && ((b & ~0xff) == 0) );

        /* shift by one half: map to [0.5f,255.5f] */
        fpal[level][0] = (float)r + 0.5f;
        fpal[level][1] = (float)g + 0.5f;
        fpal[level][2] = (float)b + 0.5f;
        fpal[level][3] = (float)a + 0.5f;
    }
}

/*
 * Expand a packed 5:5:5:5 color (alpha in bits 15-19, red in 10-14, green in
 * 5-9, blue in 0-4) to 8 bits per channel.  The top three bits of each 5-bit
 * field are replicated into the low bits of the 8-bit result.
 */
static FxU32
argb8888( FxU32 rgb5555 )
{
    const FxU32 a5 = (rgb5555 >> 15) & 0x1f;
    const FxU32 r5 = (rgb5555 >> 10) & 0x1f;
    const FxU32 g5 = (rgb5555 >>  5) & 0x1f;
    const FxU32 b5 = (rgb5555      ) & 0x1f;

    const FxU32 a8 = (a5 << 3) | (a5 >> 2);
    const FxU32 r8 = (r5 << 3) | (r5 >> 2);
    const FxU32 g8 = (g5 << 3) | (g5 >> 2);
    const FxU32 b8 = (b5 << 3) | (b5 >> 2);

    return ARGB( a8, r8, g8, b8);
}

/*
 * Expand a packed 5:5:5 color (red in bits 10-14, green in 5-9, blue in 0-4)
 * to 8 bits per channel by replicating the top bits of each field into the
 * low bits.  Alpha of the result is always 0xff.
 */
static FxU32
rgb888( FxU32 rgb555 )
{
    const FxU32 r5 = (rgb555 >> 10) & 0x1f;
    const FxU32 g5 = (rgb555 >>  5) & 0x1f;
    const FxU32 b5 = (rgb555      ) & 0x1f;

    const FxU32 r8 = (r5 << 3) | (r5 >> 2);
    const FxU32 g8 = (g5 << 3) | (g5 >> 2);
    const FxU32 b8 = (b5 << 3) | (b5 >> 2);

    return ARGB( 0xff, r8, g8, b8);
}

/*
 * Expand a packed 5:6:5 color (red in bits 11-15, green in 5-10, blue in 0-4)
 * to 8 bits per channel by replicating the top bits of each field into the
 * low bits.  Alpha of the result is always 0xff.
 */
static FxU32
rgb565_888( FxU32 rgb565 )
{
    const FxU32 r5 = (rgb565 >> 11) & 0x1f;
    const FxU32 g6 = (rgb565 >>  5) & 0x3f;
    const FxU32 b5 = (rgb565      ) & 0x1f;

    const FxU32 r8 = (r5 << 3) | (r5 >> 2);
    const FxU32 g8 = (g6 << 2) | (g6 >> 4);     /* 6-bit field: two fill bits */
    const FxU32 b8 = (b5 << 3) | (b5 >> 2);

    return ARGB( 0xff, r8, g8, b8);
}

/*
 * Pack an 8-bit-per-channel color into 5:6:5 (red in bits 11-15, green in
 * 5-10, blue in 0-4).  The low bits of each channel are simply dropped, for
 * symmetry with promotion by replication.
 */
static FxU32
rgb565( FxU32 rgb888 )
{
    const FxU32 red5   = (RED(rgb888)) >> 3;
    const FxU32 green6 = (GRN(rgb888)) >> 2;
    const FxU32 blue5  = (BLU(rgb888)) >> 3;

    return blue5 | (green6 << 5) | (red5 << 11);
}

/*
 * Pack an 8-bit-per-channel ARGB color into 5:5:5:5 (alpha in bits 15-19,
 * red in 10-14, green in 5-9, blue in 0-4) by dropping the low three bits
 * of every channel.
 */
static FxU32
argb5555( FxU32 argb8888 )
{
    const FxU32 alpha5 = (ALF(argb8888)) >> 3;
    const FxU32 red5   = (RED(argb8888)) >> 3;
    const FxU32 green5 = (GRN(argb8888)) >> 3;
    const FxU32 blue5  = (BLU(argb8888)) >> 3;

    return blue5 | (green5 << 5) | (red5 << 10) | (alpha5 << 15 );
}

/*
 * Pack an 8-bit-per-channel color into 5:5:5 (red in bits 10-14, green in
 * 5-9, blue in 0-4) by dropping the low three bits of every channel.
 */
static FxU32
rgb555( FxU32 rgb888 )
{
    const FxU32 red5   = (RED(rgb888)) >> 3;
    const FxU32 green5 = (GRN(rgb888)) >> 3;
    const FxU32 blue5  = (BLU(rgb888)) >> 3;

    return blue5 | (green5 << 5) | (red5 << 10);
}
/*
 * The eigen vector generated may sometimes have endpoints that are outside
 * the rgb color space.  Bring both endpoints back inside by clamping every
 * component of 'lo' and 'hi' independently to [0.5f, 255.5f].
 *
 * Both arrays are modified in place.  Note that this is a per-component
 * clamp, not a clip along the line: the direction of the segment may change.
 * (An earlier, disabled implementation that clipped parametrically along the
 * line has been removed.)
 */
void
clipLine(float lo[3], float hi[3])
{
    int i;

    for (i=0; i<3; i++) {
        if (lo[i] <   0.5f)
            lo[i] = 0.5f;
        if (lo[i] > 255.5f)
            lo[i] = 255.5f;
        if (hi[i] <   0.5f)
            hi[i] = 0.5f;
        if (hi[i] > 255.5f)
            hi[i] = 255.5f;
    }
}

/*
 * Given that lo and hi differ by less than 16 on all 3 coords, encode it
 * as a midpoint color at 666 resolution, plus a 12 bit signed delta.
 *
 * For each channel the midpoint of c0 and c1 is rounded to 6 bits, and half
 * of (c0 - c1) is clamped first to [-8, 7] and then so that midpoint +/- delta
 * stays inside [0, 255].
 *
 * On return c0 and c1 are overwritten with the colors that the encoded value
 * represents: c0 = midpoint - delta, c1 = midpoint + delta.
 * NOTE(review): since delta is derived from (c0 - c1), this looks as if it
 * exchanges the roles of the two endpoints -- confirm that is intended.
 *
 * The return value holds the 18-bit rgb666 midpoint in bits 12-29, the three
 * 4-bit two's-complement deltas in bits 0-11, and bit 30 set.
 */
static FxU32
encodeDelta( float c0[3], float c1[3])
{
    int         r, g, b, dr, dg, db;
    int         mid[3], half[3];
    int         ch, packed, deltas;

    for (ch = 0; ch < 3; ch++) {
        int m = (int) ((c0[ch] + c1[ch]) * 0.5f);
        int d = (int) ((c0[ch] - c1[ch]) * 0.5f);   /* half the delta value */

        /* round to rgb666 and back to 888 */
        m = (m + 2) >> 2;
        if (m > 0x3f) m = 0x3f;
        m <<= 2;

        /* Ensure it's within -8 to +7 */
        if (d < -8) d = -8;
        if (d >  7) d = 7;

        /* Shrink the delta so that both m - d and m + d stay within 0..255 */
        if (d < 0) {
            if ((m + d) <   0) d =   0 -   m;
            if ((m - d) > 255) d =   m - 255;
        } else {
            if ((m - d) <   0) d =   m -   0;
            if ((m + d) > 255) d = 255 -   m;
        }

        mid[ch]  = m;
        half[ch] = d;
    }

    r  = mid[0];   g  = mid[1];   b  = mid[2];
    dr = half[0];  dg = half[1];  db = half[2];

    /* So here's the new c0 and c1 values you would use for the palette */
    c0[0] = (float) (r - dr);
    c1[0] = (float) (r + dr);
    c0[1] = (float) (g - dg);
    c1[1] = (float) (g + dg);
    c0[2] = (float) (b - db);
    c1[2] = (float) (b + db);

    fflush(stderr);


    assert((dr >= -8) && (dr <= 7) &&
           (dg >= -8) && (dg <= 7) &&
           (db >= -8) && (db <= 7));

    assert((c0[0] >= 0.0f) && (c0[1] >= 0.0f) && (c0[2] >= 0.0f));
    assert((c1[0] >= 0.0f) && (c1[1] >= 0.0f) && (c1[2] >= 0.0f));

    assert((c0[0] <= 255.9999f) && (c0[1] <= 255.9999f) && (c0[2] <= 255.9999f));
    assert((c1[0] <= 255.9999f) && (c1[1] <= 255.9999f) && (c1[2] <= 255.9999f));

    /* This will be encoded as an rgb666 + drgb444 */
    packed = ((r >> 2) << 12) | ((g >> 2) << 6) | (b >> 2);
    deltas = ((dr & 0xf) << 8) | ((dg & 0xf) << 4) | ((db & 0xf));

    return (packed << 12) | deltas | (0x1 << 30);   // the delta mode bit  is 30
}

/*
 * Inverse of the packing done by encodeDelta(): split 'col' into an rgb666
 * midpoint (bits 12-29) and three 4-bit signed deltas (bits 0-11), and
 * reconstruct the two endpoint colors.
 *
 *   *lo = midpoint - delta,  *hi = midpoint + delta,  both with alpha 255.
 *
 * All reconstructed channels are asserted to lie within 0..255.
 */
static void
decodeDelta( FxU32 col, FxU32 *lo, FxU32 *hi )
{
    int r, g, b, dr, dg, db;
    int rlo, glo, blo, rhi, ghi, bhi;

    /* 4-bit delta fields */
    db = (col      ) & 0x0f;
    dg = (col >>  4) & 0x0f;
    dr = (col >>  8) & 0x0f;

    /* 6-bit midpoint fields */
    b  = (col >> 12) & 0x3f;
    g  = (col >> 18) & 0x3f;
    r  = (col >> 24) & 0x3f;

    /* sign extend the deltas: a set bit 3 means the value is negative */
    if (dr & 8) dr -= 16;
    if (dg & 8) dg -= 16;
    if (db & 8) db -= 16;

    /* make rgb666 to 888 */
    r <<= 2;
    g <<= 2;
    b <<= 2;

    rlo = r - dr;   rhi = r + dr;
    glo = g - dg;   ghi = g + dg;
    blo = b - db;   bhi = b + db;

    assert((rlo >=   0) && (glo >=   0) && (blo >=   0));
    assert((rlo <= 255) && (glo <= 255) && (blo <= 255));

    assert((rhi >=   0) && (ghi >=   0) && (bhi >=   0));
    assert((rhi <= 255) && (ghi <= 255) && (bhi <= 255));

    *lo = ARGB( 255, rlo, glo, blo);
    *hi = ARGB( 255, rhi, ghi, bhi);
}

// XXXdwm  Vtune says __ftol takes some 5% of the time.  Could __ftol's below be batched somehow?
/*
 * Encode one block of 32 texels into 'bits' using the given compression mode.
 *
 *   mode     TCC_HI, TCC_MIXED or TCC_CHROMA; any other value terminates the
 *            process via exit(0)
 *   mixmode  not referenced in this function
 *   alpha    non-zero if texels with ainput[i] == 0 are to be encoded as
 *            transparent (TCC_HI and TCC_MIXED only)
 *   c0..c3   palette colors chosen by the caller: endpoints c0/c1 for TCC_HI;
 *            c0/c1 for texels 0-15 and c2/c3 for texels 16-31 in TCC_MIXED;
 *            four independent colors in TCC_CHROMA.  clipLine() modifies the
 *            endpoints in place in TCC_HI and TCC_MIXED.
 *   input    the 32 texel colors, 3 floats each
 *   ainput   per-texel alpha, only tested against 0
 *   bits     output buffer, handed to bitEncoder()
 */
static void
encodeColors(int mode, int mixmode, int alpha, float c0[3], float c1[3], float c2[3], float c3[3],
             float input[][3], FxI32 ainput[], void *bits)
{
    int         i, sel, index[32];
    FxU32       lo, hi, col[4];
    float       fpal[8][3];
    float       iv[3];
    float       b;

    switch(mode) {
    case TCC_HI:
        clipLine(c0, c1);
        lo = ARGB( 255, (int) c0[0], (int) c0[1], (int) c0[2]);
        hi = ARGB( 255, (int) c1[0], (int) c1[1], (int) c1[2]);

        col[0] = rgb555( lo );
        col[1] = rgb555( hi );

        // Build the palette from the quantized endpoints, expanded back to 888.
        lo = rgb888( col[0] );
        hi = rgb888( col[1] );
        makePalette( lo, hi, 7, fpal);
        makeInterpVector( fpal, 7, iv, &b);

        /* Map input colors to closest entry in the palette */
        for (i=0; i<32; i++) {
            if ( alpha && ( ainput[i] == 0 ))
                 index[i] = 7;
            else
#define RGB_INTERP 1
#if ! RGB_INTERP
                 index[i] = bestColor((float *) &input[i][0], fpal, 7);
#else
                 index[i] = bestColorRGBInterp( (float *)&input[i][0], iv, b, 7);
#endif
        }

        /* Now encode these into the 128 bits */
        bitEncoder( mode, col, alpha, index, bits);
        break;

    case TCC_MIXED:
        clipLine(c0, c1);
        clipLine(c2, c3);

        /* Deal with even block */
        lo = ARGB( 255, (int) c0[0], (int) c0[1], (int) c0[2]);
        hi = ARGB( 255, (int) c1[0], (int) c1[1], (int) c1[2]);
        if (alpha) {
            col[0] = rgb555(lo);
            col[1] = rgb555(hi);
        } else {
            col[0] = rgb565(lo);
            col[1] = rgb565(hi);
        }
        // Note: unlike TCC_HI, the palette here is built from the unquantized lo/hi.
        makePalette( lo, hi, alpha ? 3 : 4, fpal);
        makeInterpVector( fpal, alpha ? 3 : 4, iv, &b);

        /* Map input colors to closest entry in the palette */
        for (i=0; i<16; i++) {
            if ( alpha && ( ainput[i] == 0 ))
                 index[i] = 3;
            else
#if ! RGB_INTERP
                 index[i] = bestColor((float *) &input[i][0], fpal, alpha ? 3 : 4);
#else
                 index[i] = bestColorRGBInterp( (float *)&input[i][0], iv, b, alpha ? 3 : 4);
#endif
        }

        // Bit 0 of sel is the alpha flag; bits 1 and 2 receive green lsbs below.
        sel = alpha;

        // funky encoding for lsb of green
        // If the msb of the first index does not equal the xor of the two green
        // lsbs, swap the endpoints and invert all indices so that it does.
        if (!alpha) {
           if (( (FxU32)index[0] >> 1 ) != (( (col[0]>>5) & 0x1 ) ^ ( (col[1]>>5) & 0x1 )) ) {
               FxU32 tmp = col[0];
               col[0] = col[1];
               col[1] = tmp;
               for (i=0; i<16; i++) {
                 index[i] ^= 3;
               }
           }
           sel |= ( (col[1]>>5) & 0x1 )<<1;
           // remove lsb of green
           col[0] = ((col[0] & 0xFFC0) >> 1) | (col[0] & 0x1F);
           col[1] = ((col[1] & 0xFFC0) >> 1) | (col[1] & 0x1F);
        }

        /* Now deal with odd block */
        lo = ARGB( 255, (int) c2[0], (int) c2[1], (int) c2[2]);
        hi = ARGB( 255, (int) c3[0], (int) c3[1], (int) c3[2]);
        if (alpha) {
            col[2] = rgb555(lo);
            col[3] = rgb555(hi);
        } else {
            col[2] = rgb565(lo);
            col[3] = rgb565(hi);
        }
        makePalette( lo, hi, alpha ? 3 : 4, fpal);
        makeInterpVector( fpal, alpha ? 3 : 4, iv, &b);

        /* Map input colors to closest entry in the palette */
        for (i=16; i<32; i++) {
            if ( alpha && ( ainput[i] == 0 ))
                 index[i] = 3;
            else
#if ! RGB_INTERP
                 index[i] = bestColor((float *) &input[i][0], fpal, alpha ? 3 : 4);
#else
                 index[i] = bestColorRGBInterp( (float *)&input[i][0], iv, b, alpha ? 3 : 4);
#endif
        }

        // funky encoding for lsb of green
        if (!alpha) {
           if (( (FxU32)index[16] >> 1 ) != (( (col[2]>>5) & 0x1 ) ^ ( (col[3]>>5) & 0x1 )) ) {
               FxU32 tmp = col[2];
               col[2] = col[3];
               col[3] = tmp;
               for (i=16; i<32; i++) {
                 index[i] ^= 3;
               }
           }
           sel |= ( (col[3]>>5) & 0x1 )<<2;
           // remove lsb of green
           col[2] = ((col[2] & 0xFFC0) >> 1) | (col[2] & 0x1F);
           col[3] = ((col[3] & 0xFFC0) >> 1) | (col[3] & 0x1F);
        }

        /* Now encode these into the 128 bits */
        bitEncoder( mode, col, sel, index, bits);
        break;

    case TCC_CHROMA:
        /* map float palette colors to int by truncation */
        col[0] = ARGB( 255, (int) c0[0], (int) c0[1], (int) c0[2]);
        col[1] = ARGB( 255, (int) c1[0], (int) c1[1], (int) c1[2]);
        col[2] = ARGB( 255, (int) c2[0], (int) c2[1], (int) c2[2]);
        col[3] = ARGB( 255, (int) c3[0], (int) c3[1], (int) c3[2]);

        // produce floats quantized to 555
        for (i=0; i < 4; i++) {
            int rgb;

            col[i] = rgb555( col[i] );
            rgb = rgb888( col[i] );

            /* map to float [0.5,255.5], so as to agree with range of input[][] */
            fpal[i][0] = (float) RED(rgb) + 0.5f;
            fpal[i][1] = (float) GRN(rgb) + 0.5f;
            fpal[i][2] = (float) BLU(rgb) + 0.5f;
        }

        /* Map input colors to closest entry in the palette */
        for (i=0; i<32; i++) {
            index[i] = bestColor(&input[i][0], (const float (*)[3])fpal, 4);
        }

        /* Now encode these into the 128 bits */
        bitEncoder( mode, col, alpha, index, bits);
        break;


    default:
        // Unsupported mode: the process exits, with status 0.
#if PRINT
        printf("NYI in encodeColors\n");
#endif
        exit(0);
    }
}

/*
 * Decode one compressed block into 32 texels.
 *
 *   bits    the compressed block, handed to bitDecoder()
 *   output  RETURN: 32 texels, each stored as {alpha, red, green, blue}
 *
 * bitDecoder() supplies the mode, up to four packed colors, the 32 per-texel
 * indices and a mode-dependent 'alpha' word.  Modes handled: TCC_HI,
 * TCC_MIXED, TCC_CHROMA and TCC_ALPHA; any other mode terminates the process
 * via exit(0).
 *
 * After decoding, if the global a_color_cc is not -1 and selects this block
 * (value 4, or equal to the block's mode), texels 11 and 20 are overwritten
 * with a color that identifies the mode (debug aid).
 */
static void
decodeColors( void *bits, float output[][4] )
{
    int         i, mode, index[32];
    FxU32       col[4], lo, hi;
    float       fpal[8][3];
    FxU32       alpha, glsb;

   mode = bitDecoder( bits, col, index, &alpha);
    switch(mode) {
    case TCC_HI:
        lo = rgb888(col[0]);
        hi = rgb888(col[1]);
        makePalette(lo, hi, 7, fpal);

        for (i=0; i<32; i++) {
            int j = index[i];
            if ( j == 7 ) {
                // index 7 is transparent black
                output[i][0] =
                output[i][1] =
                output[i][2] =
                output[i][3] = 0.0f;
            } else {
                output[i][0] = 255.0f;
                output[i][1] = fpal[j][0];
                output[i][2] = fpal[j][1];
                output[i][3] = fpal[j][2];
           }
        }
        break;

    case TCC_MIXED:
        // bit 0 of 'alpha' is the alpha flag; the bits above carry the green lsbs
        glsb = alpha >> 1;
        alpha &= 0x1;
        if ( alpha ) {
            lo = rgb888( col[0] );
            hi = rgb888( col[1] );
        } else {
            // compute 565 colors
            col[0] = (( col[0] & 0x7fe0 ) << 1 ) | ( col[0] & 0x1f ) |
                     (((index[0]>> 1) ^ ( glsb & 0x1)) << 5);
            col[1] = (( col[1] & 0x7fe0 ) << 1 ) | ( col[1] & 0x1f ) |
                     (( glsb & 0x1) << 5);
            lo = rgb565_888( col[0] );
            hi = rgb565_888( col[1] );
        }
        makePalette(lo, hi, alpha ? 3 : 4, fpal);
        for (i=0; i<16; i++) {
            int j = index[i];
            if ( alpha && ( j == 3 )) {
                output[i][0] =
                output[i][1] =
                output[i][2] =
                output[i][3] = 0.0f;
            } else {
                output[i][0] = 255.0f;
                output[i][1] = fpal[j][0];
                output[i][2] = fpal[j][1];
                output[i][3] = fpal[j][2];
            }
        }
        if ( alpha ) {
            lo = rgb888( col[2] );
            hi = rgb888( col[3] );
        } else {
            // compute 565 colors
            col[2] = (( col[2] & 0x7fe0 ) << 1 ) | ( col[2] & 0x1f ) |
                     (((index[16]>> 1) ^ ( glsb >> 1)) << 5);
            col[3] = (( col[3] & 0x7fe0 ) << 1 ) | ( col[3] & 0x1f ) |
                     (( glsb >> 1) << 5);
            lo = rgb565_888( col[2] );
            hi = rgb565_888( col[3] );
        }
        makePalette(lo, hi, alpha ? 3 : 4, fpal);
        for (i=16; i<32; i++) {
            int j;

            j = index[i];
            if ( alpha && ( j == 3 )) {
                output[i][0] =
                output[i][1] =
                output[i][2] =
                output[i][3] = 0.0f;
            } else {
                output[i][0] = 255.0f;
                output[i][1] = fpal[j][0];
                output[i][2] = fpal[j][1];
                output[i][3] = fpal[j][2];
            }
        }
        break;

    case TCC_CHROMA:
        for (i=0; i<4; i++) {
            int rgb;

            rgb = rgb888( col[i] );
            fpal[i][0] = (float) RED(rgb);
            fpal[i][1] = (float) GRN(rgb);
            fpal[i][2] = (float) BLU(rgb);
        }
        for (i=0; i<32; i++) {
            int j = index[i];
            output[i][0] = 255.0f;
            output[i][1] = fpal[j][0];
            output[i][2] = fpal[j][1];
            output[i][3] = fpal[j][2];
        }
         break;

    case TCC_ALPHA:
        if ( alpha ) { // interpolate colors
            float       fpal[4][4];     // shadows the outer fpal; rows are {R, G, B, A}

            lo = argb8888( col[0] );
            hi = argb8888( col[1] );
            makePaletteAlpha(lo, hi, 4, fpal);
            for (i=0; i<16; i++) {
                int j;

                j = index[i];
                // palette rows are {R,G,B,A} but output texels are {A,R,G,B}
                output[i][0] = fpal[j][3];
                output[i][1] = fpal[j][0];
                output[i][2] = fpal[j][1];
                output[i][3] = fpal[j][2];
            }

            // the second half interpolates from col[2] towards col[1]
            lo = argb8888( col[2] );
            hi = argb8888( col[1] );
            makePaletteAlpha(lo, hi, 4, fpal);
            for (i=16; i<32; i++) {
                int j;

                j = index[i];
                // palette rows are {R,G,B,A} but output texels are {A,R,G,B}
                output[i][0] = fpal[j][3];
                output[i][1] = fpal[j][0];
                output[i][2] = fpal[j][1];
                output[i][3] = fpal[j][2];
            }
        } else { // no interpolation use colors as they are index 3 = transparent black
            FxU32 p[4];
            p[0] = argb8888( col[0] );
            p[1] = argb8888( col[1] );
            p[2] = argb8888( col[2] );
            p[3] = 0; // transparent black
            for (i=0; i<32; i++) {
                int j = index[i];

                output[i][0] = (float)ALF(p[j]);
                output[i][1] = (float)RED(p[j]);
                output[i][2] = (float)GRN(p[j]);
                output[i][3] = (float)BLU(p[j]);
            }

        }
        break;

    default:
        // Unsupported mode: the process exits, with status 0.
#if PRINT
        fprintf(stderr, "NYI in decodeColors\n");
#endif
        exit(0);
    }

    if (a_color_cc == -1) return;                          // no color coding.
    if ((a_color_cc != 4) && (a_color_cc != mode)) return; // not this block

    // Do color coding.
    {
        float r, g, b;

        if (mode == TCC_HI) {
            r = 255.0f; g = 255.0f; b = 0.0f;           // yellow
        } else if (mode == TCC_CHROMA) {
            r = 255.0f; g = 0.0f; b = 0.0f;             // red
        } else if (mode == TCC_ALPHA) {
            r = 255.0f; g = 0.0f; b = 255.0f;           // magenta
        } else {
            // mixed: count how many of the first two colors have bit 30 set
            i = 0;
            if ((col[0] >> 30) & 1) i++;
            if ((col[1] >> 30) & 1) i++;

            if (i == 0) {
                r = 0.0f; g = 0.0f; b = 255.0f; }       // blue
            else if (i == 1) {
                r = 0.0f; g = 255.0f; b = 255.0f; }     // cyan
            else {
                r = 0.0f; g = 255.0f; b = 0.0f; }       // green
        }

        // Tag texels 11 and 20; their alpha is left untouched.
        output[1*8+3][1] = r;
        output[1*8+3][2] = g;
        output[1*8+3][3] = b;
        output[2*8+4][1] = r;
        output[2*8+4][2] = g;
        output[2*8+4][3] = b;
    }
}

// Tuning constants for vqChroma().  They are defined ahead of the statistics
// counters because the counter array sizes below depend on MAX_REPEAT.
#define NCOLORS  4
#define VQCHROMA_ERR_TARGET 256.0f  // Greater == faster.
                                    // Set by gathering statistics from Q3.
#define MAX_REPEAT 10

// Set STATISTICS to 1 to collect iteration-count histograms in vqChroma().
#define STATISTICS 0
#if STATISTICS
static int nvqChroma;
static int nvqChroma_outer[MAX_REPEAT+1];
// vqChroma() indexes this with k+1 where k can reach 50 when the inner loop
// runs to completion, so indices 1..51 must be valid.
static int nvqChroma_inner[50+2];
#endif

/*
 * Choose 'ncolors' palette colors for a block of 32 texels by iterative
 * vector quantization.
 *
 *   in       the 32 input colors, 3 floats each
 *   ncolors  number of palette entries wanted; values above NCOLORS are
 *            reported through txError()
 *   colors   RETURN: the chosen palette, in the same scale as 'in'.  Row 3 is
 *            written during seeding regardless of 'ncolors', so the array
 *            must have at least 4 rows.
 *
 * The input is scaled by 1/8 and the palette is seeded from texels 0, 10, 16
 * and 26.  The inner loop (at most 50 passes) assigns every texel to its
 * nearest palette entry and moves each entry to the mean of its texels.  If
 * the summed squared error is still at or above VQCHROMA_ERR_TARGET, one
 * palette entry (an unused one if any, otherwise a rand() pick) is replaced
 * by the worst-fitting texel and the refinement is repeated, up to MAX_REPEAT
 * rounds.  The palette with the lowest error seen is returned.
 *
 * NOTE(review): if txError() returns, execution continues with
 * ncolors > NCOLORS and the local arrays would be overrun -- confirm that
 * txError() does not return.
 */
static void
vqChroma(const float in[][3], int ncolors, float colors[][3])
{
    float   input[32][3];
    float   sums[NCOLORS][3];
    float   errors[NCOLORS];  // XXX never read!
    float   counts[NCOLORS];
    float   best[NCOLORS][3];
    float   besterr = 1.0e20f;  // infinity
    float   lasterr = 1.0e20f;

    float   oo8 = 1.0f/8.0f;
    float   err = 0.0f;
    int     i, j, k;
    int     repeat = MAX_REPEAT;        // iteration limit on outermost loop

#if STATISTICS
    nvqChroma++;
#endif

    if ( ncolors > NCOLORS ) // check we have enough space
        txError("FXT1 vqChroma: invalid number of colors\n");

    // Copy input colors, chopping down to 555
    // XXXdwm  ... but they're all floats!?!  Looks like the only effect of
    //         this is on the magnitudes of the constants that are tested against.
    for (i=0; i<32; i++) {
        input[i][0] = in[i][0] * oo8;
        input[i][1] = in[i][1] * oo8;
        input[i][2] = in[i][2] * oo8;
    }

    // Select ncolors initial colors from a grid
    colors[0][0] = input[ 0][0];
    colors[0][1] = input[ 0][1];
    colors[0][2] = input[ 0][2];
    colors[1][0] = input[10][0];
    colors[1][1] = input[10][1];
    colors[1][2] = input[10][2];
    colors[2][0] = input[16][0];
    colors[2][1] = input[16][1];
    colors[2][2] = input[16][2];
    colors[3][0] = input[26][0];  /* wasted if ncolors < 4 */
    colors[3][1] = input[26][1];  /* wasted if ncolors < 4 */
    colors[3][2] = input[26][2];  /* wasted if ncolors < 4 */

again:

    // Here's the vector quantizer:
    for (k=0; k<50; k++) {

        // Find closest color, and track sums.
        for (i=0; i<ncolors; i++) {
            counts[i] = 0.0f;
            sums[i][0] = sums[i][1] = sums[i][2] = 0.0f;
            errors[i]    = 0.0f;
        }
        err = 0.0f;

        for (i=0; i<32; i++) {  // for each input point
            float   e;

            j = bestColorError(&input[i][0], (const float (*)[3])colors, ncolors, &e);
            counts[j] += 1.0f;
            sums[j][0] += (input[i][0]);
            sums[j][1] += (input[i][1]);
            sums[j][2] += (input[i][2]);

            err += e;
            errors[j] += e;
        }

        // Move each palette color to the barycenter of the set of input points that
        // were closest to its previous location.
        for (j=0; j<ncolors; j++) {
            float   rc;

            // An entry with no texels has all-zero sums, so it ends up at the origin.
            rc = (counts[j] == 0.0f) ? 1.0f : 1/counts[j];
            // XXXdwm  Shouldn't colors[] be snapped to the 555 grid points?
            colors[j][0] = (sums[j][0] * rc);
            colors[j][1] = (sums[j][1] * rc);
            colors[j][2] = (sums[j][2] * rc);
        }

#if 0
        printf("It: %3d err = %f\n", k, err);
#endif

        // Stop refining once the error is tiny or has stopped changing.
        // Note that err was measured against the palette before this pass moved it.
        if ((err < 1.0f) || (ABS(lasterr - err) < 1.0f))
            break;
        lasterr = err;
    }
#if STATISTICS
        nvqChroma_inner[k+1]++;
#endif

#if 0
    printf("Alt VQ results: rep: %d, (%8.2f) %s\n", repeat, err, err != 0.0f ? "Bad" : "");
    for (i=0; i<ncolors; i++) {
        printf("[%3.0f %3.0f %3.0f] %2d (Error: %8.2f)\n", colors[i][0], colors[i][1], colors[i][2], (int) counts[i], errors[i]);
    }
    printf("Repeat = %2d, err = %8.2f, besterr = %8.2g\n", repeat, err, besterr);

#endif
    /*
     * Find worst fitting color and replace any item in the palette.
     */
    // Remember the best palette seen so far; it is what gets returned.
    if (err < besterr) {
        besterr = err;
#if 0
        printf("%g,", besterr);
#endif
        memcpy( best, colors, ncolors * 3 * sizeof(float));
    } else {
    }

    if ((err < VQCHROMA_ERR_TARGET) || (--repeat <= 0))
        goto done;

    {
        float worsterr = -1.0f;
        int   worsti = 0;

        // Locate the texel that is furthest (largest single-channel difference)
        // from its nearest palette entry.
        for (i=0; i<32; i++) {
            float   dr, dg, db;
            float   e;           /* distance according to the L-infinity metric */

            j = bestColor(&input[i][0], (const float (*)[3])colors, ncolors); /* distance according to the L-squared metric */
            dr = ABS( input[i][0] - colors[j][0] );
            dg = ABS( input[i][1] - colors[j][1] );
            db = ABS( input[i][2] - colors[j][2] );
            e = dr;
            if (dg > e) e = dg;
            if (db > e) e = db;
            if (e > worsterr) {
                worsterr = e;
                worsti   = i;
            }
        }

        /* If some palette entry is unused, use it; otherwise, gamble */
        /* XX What about an entry that has very few users, e.g. only one?  */
        for (i=0; i<ncolors; i++)
            if (counts[i] == 0.0f) break;

        if (i >= ncolors) i = rand() % ncolors;

#if 0
        printf("Repeat %d: repl %d [%3.0f %3.0f %3.0f] with [%3.0f %3.0f %3.0f]\n",
            repeat, colors[i][0], colors[i][1], colors[i][2],
            input[worsti][0], input[worsti][1], input[worsti][2]);
#endif

        /* Replace palette entry, and retry. */
        // XXXdwm  Shouldn't colors[] be snapped to the 555 grid points?
        colors[i][0] = input[worsti][0];
        colors[i][1] = input[worsti][1];
        colors[i][2] = input[worsti][2];
    }
    goto again;

done:
#if STATISTICS
    nvqChroma_outer[ MAX_REPEAT - repeat]++;
#endif

#if 0
    printf("\n");
#endif
    /* Scale colors back to 888 */
    for (i=0; i<ncolors; i++) {
        colors[i][0] = best[i][0] * 8.0f;
        colors[i][1] = best[i][1] * 8.0f;
        colors[i][2] = best[i][2] * 8.0f;
    }
}

/*
 * Vector-quantize the 32 RGBA texels of one block down to `ncolors`
 * representative RGBA colors.
 *
 *   in       32 RGB triples
 *   ain      32 alpha values
 *   ncolors  number of palette entries to produce (must not exceed NCOLORS)
 *   colors   RETURN: the best palette found, multiplied back up by 8.
 *            Rows 0..3 are always written while seeding, so the caller's
 *            array needs four rows even when ncolors == 3.
 *   lerp     forwarded to bestColorAlpha(); when it is zero, texels for
 *            which bestColorAlpha() answers 3 are left out of the fit.
 *
 * Everything is scaled by 1/8 on entry.  Each refinement pass assigns every
 * texel to its closest palette entry and then moves each entry by the mean
 * offset of its texels.  After a run of passes the palette is snapshotted
 * if it has the lowest error so far; unless the error is under
 * VQCHROMA_ERR_TARGET or the restart budget is spent, one palette entry is
 * replaced by the worst-fitting texel and refinement starts over.
 */
static void
vqChromaAlpha(const float in[][3], FxI32 ain[], int ncolors, float colors[][4], FxU32 lerp)
{
    float   input[32][4];       // scaled texels; alpha is carried as component 3
    float   deltas[NCOLORS][4]; // per entry: sum of (texel - entry) over its texels
    float   errors[NCOLORS];    // per entry error; only the disabled debug output reads it
    float   counts[NCOLORS];    // per entry: texels assigned in the latest pass
    float   best[NCOLORS][4];   // lowest-error palette seen so far
    float   besterr = 1.0e20f;  // infinity
    float   lasterr = 1.0e20f;  // error of the previous pass; carried across restarts

    const float weight = 1.0f;              // constant weight applied to every delta
    const float oo8 = 1.0f/8.0f;
    const int   seed[4] = { 0, 10, 16, 26 }; // texels that seed palette rows 0..3
    float   err = 0.0f;
    int     i, j, k, c;
    int     repeat = 10;                    // restart budget

    if ( ncolors > NCOLORS ) // check we have enough space
        txError("FXT1 vqChromaAlpha: invalid number of colors\n");

    // Copy the texels, scaled down by 8.
    for (i=0; i<32; i++) {
        for (c=0; c<3; c++)
            input[i][c] = in[i][c] * oo8;
        input[i][3] = ain[i] * oo8;
    }

    // Seed the palette from fixed texel positions.  Row 3 is written even
    // when ncolors == 3.
    for (j=0; j<4; j++) {
        for (c=0; c<4; c++)
            colors[j][c] = input[seed[j]][c];
    }

    for (;;) {
        // Here's the vector quantizer: at most 50 refinement passes.
        for (k=0; k<50; k++) {

            for (j=0; j<ncolors; j++) {
                counts[j] = 0.0f;
                errors[j] = 0.0f;
                for (c=0; c<4; c++)
                    deltas[j][c] = 0.0f;
            }
            err = 0.0f;

            // Assign each texel to its closest entry and accumulate offsets.
            for (i=0; i<32; i++) {
                int     ncomp;
                float   e;

                j = bestColorAlpha(&input[i][0], input[i][3], (const float (*)[4])colors, ncolors, lerp);
                if ( !lerp && ( j == 3 ))
                    continue;           // transparent black handled specially

                counts[j] += 1.0f;
                for (c=0; c<4; c++)
                    deltas[j][c] += (input[i][c] - colors[j][c]) * weight;

                // Squared distance to the chosen entry.  When the global
                // a_lerp is set only R, G and B contribute; otherwise alpha
                // is included as a fourth component.
                // NOTE(review): whether the a_lerp metric was meant to be
                // alpha-weighted instead is unclear.
                ncomp = a_lerp ? 3 : 4;
                e = SQUARED(colors[j][0] - input[i][0]);
                for (c=1; c<ncomp; c++)
                    e += SQUARED(colors[j][c] - input[i][c]);

                err += e;
                errors[j] += e;
            }

            // Move each entry by the mean offset of its texels; an entry
            // with no texels has zero deltas and therefore stays put.
            for (j=0; j<ncolors; j++) {
                float   n = (counts[j] == 0.0f) ? 1.0f : counts[j];

                for (c=0; c<4; c++)
                    colors[j][c] += (deltas[j][c] / n);
            }

#if 0
            printf("It: %3d err = %f\n", k, err);
#endif

            // Stop once the error is tiny or has stopped changing.
            if ((err < 1.0f) || (ABS(lasterr - err) < 1.0f))
                break;
            lasterr = err;
        }

#if 0
        printf("Alt VQ results: rep: %d, (%8.2f) %s\n", repeat, err, err != 0.0f ? "Bad" : "");
        for (j=0; j<ncolors; j++) {
            printf("[%3.0f %3.0f %3.0f %3.0f] %2d (Error: %8.2f)\n",
                   colors[j][0], colors[j][1], colors[j][2], colors[j][3], (int) counts[j], errors[j]);
        }
        printf("Repeat = %2d, err = %8.2f, besterr = %8.2g\n", repeat, err, besterr);

#endif

        // Remember this palette if it beats every earlier attempt.
        if (err < besterr) {
            besterr = err;
#if 0
            printf("%g,", besterr);
#endif
            memcpy( best, colors, ncolors * 4 * sizeof(float));
        }

        // Good enough?  (The restart budget is only charged when it is not.)
        if (err < VQCHROMA_ERR_TARGET)
            break;
        if (--repeat <= 0)
            break;

        {
            float   worsterr = -1.0f;
            int     worsti = 0;
            int     slot;

            // Find the texel that is worst served by the current palette,
            // measured as the largest per-component absolute difference.
            for (i=0; i<32; i++) {
                float   e;

                j = bestColorAlpha(&input[i][0], input[i][3], (const float (*)[4])colors, ncolors, lerp);
                if ( !lerp && ( j == 3 ))
                    continue;

                e = input[i][0] - colors[j][0];
                e = ABS(e);
                for (c=1; c<4; c++) {
                    float   d = input[i][c] - colors[j][c];

                    d = ABS(d);
                    if (d > e)
                        e = d;
                }
                if (e > worsterr) {
                    worsterr = e;
                    worsti   = i;
                }
            }

            /* If some palette entry is unused, use it; otherwise, gamble */
            for (slot=0; slot<ncolors; slot++) {
                if (counts[slot] == 0.0f)
                    break;
            }
            if (slot >= ncolors)
                slot = rand() % ncolors;

#if 0
            printf("Repeat %d: repl %d [%3.0f %3.0f %3.0f] with [%3.0f %3.0f %3.0f]\n",
                repeat, colors[slot][0], colors[slot][1], colors[slot][2], colors[slot][3],
                input[worsti][0], input[worsti][1], input[worsti][2], input[worsti][3]);
#endif

            /* Replace palette entry, and retry. */
            for (c=0; c<4; c++)
                colors[slot][c] = input[worsti][c];
        }
    }

#if 0
    printf("\n");
#endif

    /* Hand back the best palette, multiplied back up by 8 */
    for (j=0; j<ncolors; j++) {
        for (c=0; c<4; c++)
            colors[j][c] = best[j][c] * 8.0f;
    }
}

// Running counts of how many blocks were encoded in each mode.  They are only
// ever incremented in the code below; the single place that would print them
// (end of sst2FXT1Encode4bpp) is commented out.
static int _cc_hi = 0;          // TCC_HI blocks
static int _cc_mixed_3 = 0;     // TCC_MIXED submode 3; only touched by code under "#ifdef notdef"
static int _cc_mixed_12 = 0;    // TCC_MIXED submode 1 or 2; only touched by code under "#ifdef notdef"
static int _cc_mixed_0 = 0;     // TCC_MIXED submode 0 blocks
static int _cc_chroma = 0;      // TCC_CHROMA blocks
static int _cc_alpha = 0;       // TCC_ALPHA blocks (counted inside encodeAlpha)

/*
 * Encode one 32-texel block in TCC_ALPHA mode.
 *
 *   input   32 RGB triples (texels 0..15 = even half, 16..31 = odd half)
 *   ainput  32 alpha values
 *   bits    destination handed to bitEncoder()
 *   lerp    non-zero: each half is coded against a 4-entry palette built by
 *                     makePaletteAlpha() from two endpoint colors;
 *           zero:     all 32 texels are coded against the three colors
 *                     returned by vqChromaAlpha() directly.
 *
 * Three representative RGBA colors are obtained from vqChromaAlpha(); they
 * are passed to bitEncoder() as icol[0..2] in 5555 form together with one
 * palette index per texel.  Increments _cc_alpha.
 */
static void
encodeAlpha( float input[][3], FxI32  ainput[], void *bits, FxU32 lerp)
{
    FxU32   lo, hi, p[3], icol[3];
    float   col[4][4];      /* vqChromaAlpha() writes four rows while seeding */
    float   fpal[4][4];
    int     i, index[32];

    vqChromaAlpha( (const float (*)[3])input, ainput, 3, col, lerp);

    if ( lerp ) {
        /* Even half: palette runs from col[0] (lo) to col[1] (hi). */
        lo = ARGB( (int)col[0][3], (int) col[0][0], (int) col[0][1], (int) col[0][2]);
        hi = ARGB( (int)col[1][3], (int) col[1][0], (int) col[1][1], (int) col[1][2]);
        makePaletteAlpha( lo, hi, 4, fpal);
        icol[0] = argb5555( lo );
        icol[1] = argb5555( hi );

        /* Map input colors to closest entry in the palette */
        for (i=0; i<16; i++) {
            index[i] = bestColorAlpha(&input[i][0], (float)ainput[i], (const float (*)[4])fpal, 4, lerp);
        }

        /* Odd half: palette runs from col[2] (lo) to the shared col[1] (hi). */
        lo = ARGB( (int)col[2][3], (int) col[2][0], (int) col[2][1], (int) col[2][2]);
        hi = ARGB( (int)col[1][3], (int) col[1][0], (int) col[1][1], (int) col[1][2]);
        makePaletteAlpha( lo, hi, 4, fpal);
        /* col[1] is already stored in icol[1]; icol[2] has to carry col[2],
         * the odd half's own endpoint.  Storing hi here would drop col[2]
         * from the encoded block even though the odd indices below are
         * chosen against a col[2]..col[1] palette. */
        icol[2] = argb5555( lo );

        /* Map input colors to closest entry in the palette */
        for (i=16; i<32; i++) {
            index[i] = bestColorAlpha(&input[i][0], (float)ainput[i], (const float (*)[4])fpal, 4, lerp);
        }
    } else { // no interpolation
        p[0] = ARGB( (int)col[0][3], (int) col[0][0], (int) col[0][1], (int) col[0][2]);
        p[1] = ARGB( (int)col[1][3], (int) col[1][0], (int) col[1][1], (int) col[1][2]);
        p[2] = ARGB( (int)col[2][3], (int) col[2][0], (int) col[2][1], (int) col[2][2]);
        icol[0] = argb5555( p[0] );
        icol[1] = argb5555( p[1] );
        icol[2] = argb5555( p[2] );

        /* Map input colors to closest entry in the palette */
        for (i=0; i<32; i++) {
            index[i] = bestColorAlpha(&input[i][0], (float)ainput[i], (const float (*)[4])col, 3, lerp);
        }
    }

    /* Now encode these into the 128 bits */
    bitEncoder( TCC_ALPHA, icol, lerp, index, bits);
    _cc_alpha++;
}

#if STATISTICS
// Histogram over blocks sent to TCC_CHROMA by quantize4bpp_block(), indexed
// by the number of distinct RGB values found among the block's 32 texels.
// NOTE(review): that number can be 32, which is one past the end of this
// array -- confirm and enlarge to 33 entries if so.
static int h_nunique[32];
#endif

/*
 * Pick a compression mode for one block of 32 texels and encode it.
 *
 *   input   32 RGB triples; texels 0..15 form the "even" half and texels
 *           16..31 the "odd" half.
 *   ainput  32 alpha values.  Modified in place: values within a_tolerance
 *           of 255 or of 0 are snapped to exactly 255 or 0.
 *   bits    destination handed on to the encoders.
 *
 * Decision order:
 *   1. a_force_cc != -1 forces that TCC_* mode.  A value matching none of
 *      the cases returns without writing anything to bits.
 *   2. Any partially transparent texel, or purely on/off alpha combined
 *      with Werr[1] >= 20, selects TCC_ALPHA.
 *   3. If, for either half, |second| + |third| eigenvalue exceeds 8, the
 *      block is not oblong enough and TCC_CHROMA is used.
 *   4. Otherwise TCC_MIXED submode 0.  The TCC_HI alternative is compiled
 *      in but disabled by the "if ( 0)" under DWM.
 */
static void
quantize4bpp_block(float input[][3], FxI32  ainput[], void *bits)
{
    float   Evalues[3];                             // even  block Eigen values
    float   Ovalues[3];                             // odd   block Eigen values
    float   Wvalues[3];                             // whole block Eigen values

    float   Eavg[3], Emin[3], Emax[3], Eerr[3];     // even  block
    float   Oavg[3], Omin[3], Omax[3], Oerr[3];     // odd   block
    float   Wavg[3], Wmin[3], Wmax[3], Werr[3];     // whole block

    float   Eflo[3][3], Efhi[3][3];                 // even  block
    float   Oflo[3][3], Ofhi[3][3];                 // odd   block
    float   Wflo[3][3], Wfhi[3][3];                 // whole block

    float   output[32][3];
    float   col[4][3];
    int     submode = 0;
    int     i, alpha = 0;

#define ASSUME_ALPHA_EQUALS_ONE 0  // XX Set this to work around an apparent bug in PaintShop
#if ASSUME_ALPHA_EQUALS_ONE
#else
    // determine alpha properties:
    //     alpha == 0 => opaque,
    //     alpha == 1 => bimodal (opaque or transp)
    //     alpha == 2 => partially transparent
    for (i=0; i<32; i++) {
        // if alpha differs from 0 or 255 within tolerance it can still use non-alpha blocks.
        if ( ainput[i] >= ( 255 - a_tolerance ) )
            ainput[i] = 255;

        if ( ainput[i] <=  a_tolerance )
            ainput[i] = 0;

        if ( ainput[i] == 0 ) {
            // A fully transparent texel makes the block at least bimodal,
            // but must not demote a block already known to contain partial
            // transparency; otherwise the classification would depend on
            // the order in which the texels happen to appear.
            if ( alpha != 2 )
                alpha = 1;
        } else if ( ainput[i] != 255 ) {
            alpha = 2;
        }
    }
#endif

    // whole block statistics
    eigenStatistics(32, (const float (*)[3])input, Wvalues, output, Wflo, Wfhi, Wavg /*not used*/, Wmin, Wmax, Werr);

#if PRINT
    // NOTE(review): the even/odd statistics printed here have not been
    // computed yet at this point in the function.
    fprintf(stderr, "NEW TILE----------------------(%4d %4d)\n", globalX, globalY);
    printStatistics(32, input, output, Wflo, Wfhi, Wavg, Wmin, Wmax, Werr, "Whole Block\n");
    printStatistics(16, NULL, NULL  , Eflo, Efhi, Eavg, Emin, Emax, Eerr, "Even  Block\n");
    printStatistics(16, NULL, NULL  , Oflo, Ofhi, Oavg, Omin, Omax, Oerr, "Odd   Block\n");
#endif

    if (a_force_cc != -1) {
        // int loEven, loOdd;

        switch (a_force_cc) {
        case TCC_HI:
            encodeColors( TCC_HI, 0, alpha,
               &Wflo[0][0], &Wfhi[0][0], NULL, NULL, input, ainput, bits);
            _cc_hi++;
            return;

        case TCC_MIXED:
            submode = 0;
            // Even, odd block statistics
            eigenStatistics(16, (const float(*)[3]) &input[ 0][0], Evalues, output,
                  Eflo, Efhi, Eavg /*not used*/, Emin, Emax, Eerr /*not used*/);
            eigenStatistics(16, (const float(*)[3]) &input[16][0], Ovalues, output,
                  Oflo, Ofhi, Oavg /*not used*/, Omin, Omax, Oerr /*not used*/);

            encodeColors( TCC_MIXED, submode, alpha,
              &Eflo[0][0], &Efhi[0][0], &Oflo[0][0], &Ofhi[0][0], input, ainput, bits);
            _cc_mixed_0++;
            return;

        case TCC_CHROMA:
            // NOTE(review): this forced path passes 0 as the alpha argument
            // of encodeColors(), whereas the automatic CHROMA path further
            // down passes 'alpha' -- confirm which is intended.
            vqChroma( (const float (*)[3])input, alpha ? 3 : 4, col);
            encodeColors( TCC_CHROMA, 0, 0,
                &col[0][0], &col[1][0], &col[2][0], &col[3][0], input, ainput, bits);
            _cc_chroma++;
            return;

        case TCC_ALPHA:
            encodeAlpha( input, ainput, bits, a_lerp );
            break;

        default:
            // Unrecognised forced mode: nothing is encoded.
            break;
        }
        return;
    }

    if (( alpha == 2 ) || ((alpha == 1 ) && (Werr[1] >= 20))) {
        // strong alpha component or strong color component with alpha use TCC_ALPHA
        // LOOOK need to determine whether to interpolate or not
        encodeAlpha( input, ainput, bits, a_lerp);
        return;
    }

    // Even, odd block statistics
    eigenStatistics(16, (const float(*)[3]) &input[ 0][0], Evalues, output,
        Eflo, Efhi, Eavg /*not used*/, Emin, Emax, Eerr /*not used*/);
    eigenStatistics(16, (const float(*)[3]) &input[16][0], Ovalues, output,
        Oflo, Ofhi, Oavg /*not used*/, Omin, Omax, Oerr /*not used*/);

    // If color distribution is not "sufficiently" oblong, go CHROMA
    // XXX Dither if something like the following condition is satisfied?
    //         (Werr[0] < 4*4)   // colors in a chroma block * best case color resolution (green)
    // Note that left and right halves might be each separately oblong, but not the whole.
    // Each half is judged by its own second and third eigenvalues.
    if ( fabs(Evalues[1])+fabs(Evalues[2]) > 8 ||
         fabs(Ovalues[1])+fabs(Ovalues[2]) > 8)
    {

#if STATISTICS
        {
            int nunique = 0;
            int iu, ii;
            float unique[32][3];
            int h_unique[32];       // occurrences of each distinct color

            for (ii=0; ii<32; ii++)
            {
                for (iu=0; iu<nunique; iu++)
                {
                    if ( input[ii][0] == unique[iu][0] &&
                         input[ii][1] == unique[iu][1] &&
                         input[ii][2] == unique[iu][2])
                    {
                        h_unique[iu]++;
                        goto next_input;
                    }
                }
                unique[nunique][0] = input[ii][0];
                unique[nunique][1] = input[ii][1];
                unique[nunique][2] = input[ii][2];
                h_unique[nunique] = 1;      // first occurrence of this color
                nunique++;
              next_input:
                ;
            }

            // NOTE(review): nunique can reach 32 here; confirm that
            // h_nunique has room for index 32.
            h_nunique[nunique]++;
        }
#endif

        vqChroma( (const float (*)[3])input, alpha ? 3 : 4, col);
        encodeColors( TCC_CHROMA, 0, alpha,
            &col[0][0], &col[1][0], &col[2][0], &col[3][0], input, ainput, bits);
        _cc_chroma++;
        return;
    }

    {
#ifdef notdef
        // commented out to get rid of error on VC++ 6.0
        // int loEven, loOdd;
#endif

        int skewed;

        /*
         * No chrominance, only intensity changes in this block, so we
         * ignore the 2nd eigenvector
         *
         * Now we decide between coding at 7 levels, or 4 levels
         */
        // XXX dwm 'min' and 'max' are in eigen space; not sure what this expression means.
        //         Also, it's correct only if all 'min's and 'max's are positive.
        // While DWM is 1 this value is computed but never consulted.
        skewed = (ABS(ABS(Wmin[0]) - ABS(Wmax[0])) > 32) ||
                 (ABS(ABS(Emin[0]) - ABS(Emax[0])) > 32) ||
                 (ABS(ABS(Omin[0]) - ABS(Omax[0])) > 32) ;

#ifdef notdef
        loEven = (ABS(Eflo[0][0] - Efhi[0][0]) < 15) &&
                 (ABS(Eflo[0][1] - Efhi[0][1]) < 15) &&
                 (ABS(Eflo[0][2] - Efhi[0][2]) < 15) ;


        loOdd  = (ABS(Oflo[0][0] - Ofhi[0][0]) < 15) &&
                 (ABS(Oflo[0][1] - Ofhi[0][1]) < 15) &&
                 (ABS(Oflo[0][2] - Ofhi[0][2]) < 15) ;

        if (loEven && loOdd) {
            // Both halves have small variations only, split to 4 levels each
            encodeColors( TCC_MIXED, 3, alpha,
               &Eflo[0][0], &Efhi[0][0], &Oflo[0][0], &Ofhi[0][0], input, ainput, bits);
            _cc_mixed_3++;
            return;
        }

        if ((loEven || loOdd)){
            submode = 0;
            if (loEven) submode |= 1;
            if (loOdd ) submode |= 2;
            encodeColors( TCC_MIXED, submode, alpha,
               &Eflo[0][0], &Efhi[0][0], &Oflo[0][0], &Ofhi[0][0], input, ainput, bits);
            _cc_mixed_12++;
            return;
        }
#endif

#define DWM 1
#if DWM // Distribution is known to be oblong at this point due to failure
        // of CHROMA test above.
//	  // Now determine whether range along major color axis is compact relative to
//	  // total space.  This will mean that the distribution cannot
//	  // be severely 'grouped' left versus right in the block, which
//	  // causes TCC_HI to produce bad results.
//	  // 7 is number of levels in TCC_HI; 4 is best case truncation error (green)
//	 (Werr[0] <= 7*4)    // in color units
//        if ( Werr[0] <= 7*4 &&
//             (fabs(Wvalues[1])+fabs(Wvalues[2])) < 500)
        if ( 0)
#else
        // Neither half has small variations, split whole block into 8 levels
        if (skewed)
#endif
        {
            encodeColors( TCC_HI, 0, alpha,
               &Wflo[0][0], &Wfhi[0][0], NULL, NULL, input, ainput, bits);
            _cc_hi++;
            return;
        } else {
            encodeColors( TCC_MIXED, 0, alpha,
               &Eflo[0][0], &Efhi[0][0], &Oflo[0][0], &Ofhi[0][0], input, ainput, bits);
            _cc_mixed_0++;
            return;
        }
    }
}

/*
 * Gather one 8x4 tile from four scanline pointers and encode it.
 *
 *   pp0..pp3  the tile's four scanlines; eight packed pixels are read from
 *             each
 *   code      destination handed to quantize4bpp_block()
 *
 * The tile is linearised as two 4x4 halves: columns 0..3 of every scanline
 * fill texels 0..15 (four per scanline), columns 4..7 fill texels 16..31.
 */
static void
encode4bpp_block(
    int *pp0,
    int *pp1,
    int *pp2,
    int *pp3,
    int *code)
{
    int    *scanline[4];
    float   input[32][3];
    FxI32   ainput[32];
    int     half, row, col;

    scanline[0] = pp0;
    scanline[1] = pp1;
    scanline[2] = pp2;
    scanline[3] = pp3;

    /* Each color channel is offset by one half.  The arithmetic means taken
     * later then behave well under truncation to lower precision: the mean
     * of two adjacent values lies exactly on the boundary between them.
     */
    for (half=0; half<2; half++) {              /* left 4x4, then right 4x4 */
        for (row=0; row<4; row++) {
            for (col=0; col<4; col++) {
                int texel = scanline[row][4*half + col];
                int slot  = 16*half + 4*row + col;

                ainput[slot]   = ALF(texel);
                input[slot][0] = (float) (RED(texel)) + 0.5f;
                input[slot][1] = (float) (GRN(texel)) + 0.5f;
                input[slot][2] = (float) (BLU(texel)) + 0.5f;
            }
        }
    }

    quantize4bpp_block(input, ainput, code);
}

/* Pack four float channels with ARGB() after truncating each one to int.
 * Every argument is parenthesised so that an expression argument is cast as
 * a whole rather than only its first operand. */
#define FARGB(a, r, g, b)    (ARGB( (int)(a), (int)(r), (int)(g), (int)(b) ))

/*
 * Decode one 128-bit block and scatter its 32 texels over an 8x4 tile.
 *
 *   code      encoded block, handed to decodeColors()
 *   pp0..pp3  the tile's four scanlines; eight packed pixels are written to
 *             each
 *
 * Texels 0..15 land in columns 0..3 (four per scanline) and texels 16..31
 * in columns 4..7.
 */
static void
decode4bpp_block(
    int *code,
    int *pp0,
    int *pp1,
    int *pp2,
    int *pp3)
{
    int        *scanline[4];
    float       output[32][4];  // components 0..3 feed FARGB's a, r, g, b in that order
    int         half, row, col;

    scanline[0] = pp0;
    scanline[1] = pp1;
    scanline[2] = pp2;
    scanline[3] = pp3;

    decodeColors(code, output);

    // Pack each decoded texel straight into its place in the tile.
    for (half=0; half<2; half++) {
        for (row=0; row<4; row++) {
            for (col=0; col<4; col++) {
                const float *px = output[16*half + 4*row + col];

                scanline[row][4*half + col] = FARGB(px[0], px[1], px[2], px[3]);
            }
        }
    }
}


/*
 * Encode an image tile by tile, walking 8x4 tiles in row-major order.
 *
 *   data     source pixels, 'width' ints per scanline
 *   width    image width in pixels
 *   height   image height in pixels
 *   encoded  output; advanced by four ints (128 bits) per tile
 *
 * Every tile reads a full 8x4 pixel rectangle, so this presumably expects
 * width to be a multiple of 8 and height a multiple of 4 -- confirm against
 * the callers.  globalX/globalY are set to the tile origin before each tile
 * is encoded.
 */
void
sst2FXT1Encode4bpp(int *data, int width, int height, int* encoded)
{
    int *block = encoded;
    int  tx, ty;

    for (ty = 0; ty < height; ty += 4) {
        for (tx = 0; tx < width; tx += 8) {
            int *row0 = &data[tx + ty * width];     // top-left pixel of the tile

            globalX = tx;
            globalY = ty;
            encode4bpp_block(row0,
                             row0 + width,
                             row0 + 2 * width,
                             row0 + 3 * width,
                             block);
            block += 4;         // 128 bits per 8x4 block = 4bpp
        }
    }
    // fprintf(stderr, "%d alpha, %d chroma, %d mixed3, %d mixed12, %d mixed0, %d hi\n",
    //         _cc_alpha, _cc_chroma, _cc_mixed_3, _cc_mixed_12, _cc_mixed_0, _cc_hi);
}

/*
 * Decode an image tile by tile, walking 8x4 tiles in row-major order.
 *
 *   encoded  input; four ints (128 bits) are consumed per tile
 *   width    image width in pixels
 *   height   image height in pixels
 *   data     destination pixels, 'width' ints per scanline
 *
 * Every tile writes a full 8x4 pixel rectangle, so this presumably expects
 * width to be a multiple of 8 and height a multiple of 4 -- confirm against
 * the callers.  globalX/globalY are set to the tile origin before each tile
 * is decoded.
 */
void
sst2FXT1Decode4bpp(int *encoded, int width, int height, int *data)
{
    int *block = encoded;
    int  tx, ty;

    for (ty = 0; ty < height; ty += 4) {
        for (tx = 0; tx < width; tx += 8) {
            int *row0 = &data[tx + ty * width];     // top-left pixel of the tile

            globalX = tx;
            globalY = ty;
            decode4bpp_block(block,
                             row0,
                             row0 + width,
                             row0 + 2 * width,
                             row0 + 3 * width);
            block += 4;         // 128 bits per 8x4 block = 4bpp
        }
    }
}