added LFB read/write MMX specializations

This commit is contained in:
dborca
2003-06-27 10:40:52 +00:00
parent e115acc717
commit 0ddbb273b6

View File

@@ -18,7 +18,7 @@
** COPYRIGHT 3DFX INTERACTIVE, INC. 1999, ALL RIGHTS RESERVED
**
** $Header$
** $Log:
** $Log$
**
** 02/23/03 KoolSmoky - merged with Colourless's sources
** 14 3dfx 1.7.1.2.1.2 10/11/00 Brent Forced check in to enforce
@@ -54,7 +54,6 @@
** 32-bit buffers. Added big-endian swizzling support for PowerPC systems.
** Read locks now return pixel format in lfbInfo.
** 1 3dfx 1.0 09/11/99 StarTeam VTS Administrator
** $
**
** 33 7/14/99 9:39a Atai
** direct register write for glide3x
@@ -286,6 +285,332 @@
#include <lindri.h>
#endif /* defined(__linux__) */
/*---------------------------------------------------------------------------
** MMX specializations
*/
#ifdef __GNUC__
/* Desc: execute the emms instruction (GNU C inline assembly).
 *
 * Note: the row-copy macros below use mm0 and never issue emms themselves.
 */
#define MMX_RESET() __asm__ __volatile__ ("emms")
/* Desc: copy one row of pixels (GNU C inline assembly, 32-bit x86)
 *
 * In  : src    = source buffer
 *       dst    = destination buffer
 *       length = number of bytes to copy; the code moves data in units of
 *                2, 4 and 8 bytes only, so a trailing odd byte is not copied
 *
 * Note: For rows of at least 8 bytes, first copies 2 and/or 4 bytes so that
 *       bits 1 and 2 of the src address are clear, then moves 8 bytes per
 *       iteration through mm0, then finishes with a 4- and a 2-byte tail.
 *       Shorter rows use the tail code only.
 *       Does not execute emms.
 *       Clobbers eax, ecx, esi, edi and the flags, and writes memory; the
 *       "memory" and "cc" clobbers tell the compiler so, so that it does not
 *       keep stale copies of the buffers in registers across the asm.
 */
#define MMX_SRCLINE(src, dst, length) __asm __volatile ("\n\
        movl    %2, %%ecx       \n\
        movl    %0, %%esi       \n\
        movl    %1, %%edi       \n\
        cmpl    $8, %%ecx       \n\
        jb      4f              \n\
        testl   $2, %%esi       \n\
        jz      1f              \n\
        movw    (%%esi), %%ax   \n\
        addl    $2, %%esi       \n\
        movw    %%ax, (%%edi)   \n\
        addl    $2, %%edi       \n\
        subl    $2, %%ecx       \n\
        .p2align 3,,7           \n\
1:                              \n\
        testl   $4, %%esi       \n\
        jz      2f              \n\
        movl    (%%esi), %%eax  \n\
        addl    $4, %%esi       \n\
        movl    %%eax, (%%edi)  \n\
        addl    $4, %%edi       \n\
        subl    $4, %%ecx       \n\
        .p2align 3,,7           \n\
2:                              \n\
        movl    %%ecx, %%eax    \n\
        andl    $7, %%ecx       \n\
        shrl    $3, %%eax       \n\
        jz      4f              \n\
        .p2align 3,,7           \n\
3:                              \n\
        movq    (%%esi), %%mm0  \n\
        addl    $8, %%esi       \n\
        movq    %%mm0, (%%edi)  \n\
        addl    $8, %%edi       \n\
        decl    %%eax           \n\
        jnz     3b              \n\
        .p2align 3,,7           \n\
4:                              \n\
        testl   $4, %%ecx       \n\
        jz      5f              \n\
        movl    (%%esi), %%eax  \n\
        addl    $4, %%esi       \n\
        movl    %%eax, (%%edi)  \n\
        addl    $4, %%edi       \n\
        .p2align 3,,7           \n\
5:                              \n\
        testl   $2, %%ecx       \n\
        jz      6f              \n\
        movw    (%%esi), %%ax   \n\
        movw    %%ax, (%%edi)   \n\
        .p2align 3,,7           \n\
6:"::"g"(src), "g"(dst), "g"(length):"%eax", "%ecx", "%esi", "%edi", "memory", "cc")
/* Desc: copy one row of 16bit pixels (GNU C inline assembly, 32-bit x86)
 *
 * In  : src   = source buffer
 *       dst   = destination buffer
 *       width = number of pixels (2 bytes each)
 *
 * Note: For rows of at least 4 pixels, first copies 1 and/or 2 pixels so
 *       that bits 1 and 2 of the dst address are clear, then moves 4 pixels
 *       (8 bytes) per iteration through mm0, then finishes with a 2-pixel
 *       and a 1-pixel tail.  Narrower rows use the tail code only.
 *       Does not execute emms.
 *       Clobbers eax, ecx, esi, edi and the flags, and writes memory; the
 *       "memory" and "cc" clobbers tell the compiler so, so that it does not
 *       keep stale copies of the buffers in registers across the asm.
 */
#define MMX_DSTLINE2(src, dst, width) __asm __volatile ("\n\
        movl    %2, %%ecx       \n\
        movl    %0, %%esi       \n\
        movl    %1, %%edi       \n\
        cmpl    $4, %%ecx       \n\
        jb      4f              \n\
        testl   $2, %%edi       \n\
        jz      1f              \n\
        movw    (%%esi), %%ax   \n\
        addl    $2, %%esi       \n\
        movw    %%ax, (%%edi)   \n\
        addl    $2, %%edi       \n\
        decl    %%ecx           \n\
        .p2align 3,,7           \n\
1:                              \n\
        testl   $4, %%edi       \n\
        jz      2f              \n\
        movl    (%%esi), %%eax  \n\
        addl    $4, %%esi       \n\
        movl    %%eax, (%%edi)  \n\
        addl    $4, %%edi       \n\
        subl    $2, %%ecx       \n\
        .p2align 3,,7           \n\
2:                              \n\
        movl    %%ecx, %%eax    \n\
        andl    $3, %%ecx       \n\
        shrl    $2, %%eax       \n\
        jz      4f              \n\
        .p2align 3,,7           \n\
3:                              \n\
        movq    (%%esi), %%mm0  \n\
        addl    $8, %%esi       \n\
        movq    %%mm0, (%%edi)  \n\
        addl    $8, %%edi       \n\
        decl    %%eax           \n\
        jnz     3b              \n\
        .p2align 3,,7           \n\
4:                              \n\
        testl   $2, %%ecx       \n\
        jz      5f              \n\
        movl    (%%esi), %%eax  \n\
        addl    $4, %%esi       \n\
        movl    %%eax, (%%edi)  \n\
        addl    $4, %%edi       \n\
        .p2align 3,,7           \n\
5:                              \n\
        testl   $1, %%ecx       \n\
        jz      6f              \n\
        movw    (%%esi), %%ax   \n\
        movw    %%ax, (%%edi)   \n\
        .p2align 3,,7           \n\
6:"::"g"(src), "g"(dst), "g"(width):"%eax", "%ecx", "%esi", "%edi", "memory", "cc")
/* Desc: copy one row of 32bit pixels (GNU C inline assembly, 32-bit x86)
 *
 * In  : src   = source buffer
 *       dst   = destination buffer
 *       width = number of pixels (4 bytes each)
 *
 * Note: For rows of at least 2 pixels, first copies one pixel when bit 2 of
 *       the dst address is set, so that the following 8-byte stores through
 *       mm0 start on an address with that bit clear; a single leftover pixel
 *       is copied at the end.  Narrower rows use the tail code only.
 *       The alignment test checks bit 2 ($4) of the address: testing bit 0
 *       would never trigger for dword-aligned pixel rows.
 *       Does not execute emms.
 *       Clobbers eax, ecx, esi, edi and the flags, and writes memory; the
 *       "memory" and "cc" clobbers tell the compiler so, so that it does not
 *       keep stale copies of the buffers in registers across the asm.
 */
#define MMX_DSTLINE4(src, dst, width) __asm __volatile ("\n\
        movl    %2, %%ecx       \n\
        movl    %0, %%esi       \n\
        movl    %1, %%edi       \n\
        cmpl    $2, %%ecx       \n\
        jb      4f              \n\
        testl   $4, %%edi       \n\
        jz      2f              \n\
        movl    (%%esi), %%eax  \n\
        addl    $4, %%esi       \n\
        movl    %%eax, (%%edi)  \n\
        addl    $4, %%edi       \n\
        decl    %%ecx           \n\
        .p2align 3,,7           \n\
2:                              \n\
        movl    %%ecx, %%eax    \n\
        andl    $1, %%ecx       \n\
        shrl    %%eax           \n\
        jz      4f              \n\
        .p2align 3,,7           \n\
3:                              \n\
        movq    (%%esi), %%mm0  \n\
        addl    $8, %%esi       \n\
        movq    %%mm0, (%%edi)  \n\
        addl    $8, %%edi       \n\
        decl    %%eax           \n\
        jnz     3b              \n\
        .p2align 3,,7           \n\
4:                              \n\
        testl   $1, %%ecx       \n\
        jz      5f              \n\
        movl    (%%esi), %%eax  \n\
        movl    %%eax, (%%edi)  \n\
        .p2align 3,,7           \n\
5:"::"g"(src), "g"(dst), "g"(width):"%eax", "%ecx", "%esi", "%edi", "memory", "cc")
#else
/* Desc: execute the emms instruction (brace-delimited __asm block).
 *
 * Note: the row-copy macros below use mm0 and never issue emms themselves.
 */
#define MMX_RESET() __asm { emms }
/* Desc: copy one row of pixels
 *
 * In  : length = number of bytes (pixels*bytesperpixel). Must be even, which
 *                holds true because we don't support 8bit. The existing
 *                code assumes this as well, so this won't be a problem.
 *       src    = source buffer
 *       dst    = destination buffer
 *
 * Note: Aligns src (LFB) before copying. Clobbers eax, ecx, esi, edi (and
 *       mm0). Rows shorter than 8 bytes skip the alignment and the MMX loop
 *       and are handled by the tail code only. Does not execute emms.
 *       Every label carries the _mmx_srcline suffix so that it cannot clash
 *       with other labels of the function the macro is expanded in.
 */
#define MMX_SRCLINE(src, dst, length) __asm {\
__asm mov ecx, length /* ecx = bytes left */ \
__asm mov esi, src \
__asm mov edi, dst \
__asm cmp ecx, 8 /* fewer than 8 bytes: tail code only */ \
__asm jb small_move_mmx_srcline \
__asm test esi, 2 /* src address bit 1 set: copy 2 bytes first */ \
__asm jz check4_mmx_srcline \
__asm mov ax, [esi] \
__asm add esi, 2 \
__asm mov [edi], ax \
__asm add edi, 2 \
__asm sub ecx, 2 \
__asm align 8 \
__asm check4_mmx_srcline: \
__asm test esi, 4 /* src address bit 2 set: copy 4 bytes first */ \
__asm jz aligned8_mmx_srcline \
__asm mov eax, [esi] \
__asm add esi, 4 \
__asm mov [edi], eax \
__asm add edi, 4 \
__asm sub ecx, 4 \
__asm align 8 \
__asm aligned8_mmx_srcline: \
__asm mov eax, ecx \
__asm and ecx, 7 /* ecx = leftover bytes (0..7) */ \
__asm shr eax, 3 /* eax = number of 8-byte groups */ \
__asm jz small_move_mmx_srcline \
__asm align 8 \
__asm big_move_mmx_srcline: /* 8 bytes per iteration through mm0 */ \
__asm movq mm0, [esi] \
__asm add esi, 8 \
__asm movq [edi], mm0 \
__asm add edi, 8 \
__asm dec eax \
__asm jnz big_move_mmx_srcline \
__asm align 8 \
__asm small_move_mmx_srcline: \
__asm test ecx, 4 /* 4 leftover bytes? */ \
__asm jz check2_mmx_srcline \
__asm mov eax, [esi] \
__asm add esi, 4 \
__asm mov [edi], eax \
__asm add edi, 4 \
__asm align 8 \
__asm check2_mmx_srcline: \
__asm test ecx, 2 /* 2 leftover bytes? */ \
__asm jz finish_mmx_srcline \
__asm mov ax, [esi] \
__asm mov [edi], ax \
__asm align 8 \
__asm finish_mmx_srcline: \
}
/* Desc: copy one row of 16bit pixels
 *
 * In  : width = number of pixels
 *       src   = source buffer
 *       dst   = destination buffer
 *
 * Note: Aligns dst (LFB) before copying. Clobbers eax, ecx, esi, edi (and
 *       mm0). Rows narrower than 4 pixels skip the alignment and the MMX
 *       loop and are handled by the tail code only. Does not execute emms.
 */
#define MMX_DSTLINE2(src, dst, width) __asm {\
__asm mov ecx, width /* ecx = pixels left */ \
__asm mov esi, src \
__asm mov edi, dst \
__asm cmp ecx, 4 /* fewer than 4 pixels (8 bytes): tail code only */ \
__asm jb small_move_mmx_dstline2 \
__asm test edi, 2 /* dst address bit 1 set: copy one pixel first */ \
__asm jz check4_mmx_dstline2 \
__asm mov ax, [esi] \
__asm add esi, 2 \
__asm mov [edi], ax \
__asm add edi, 2 \
__asm dec ecx \
__asm align 8 \
__asm check4_mmx_dstline2: \
__asm test edi, 4 /* dst address bit 2 set: copy two pixels first */ \
__asm jz aligned8_mmx_dstline2 \
__asm mov eax, [esi] \
__asm add esi, 4 \
__asm mov [edi], eax \
__asm add edi, 4 \
__asm sub ecx, 2 \
__asm align 8 \
__asm aligned8_mmx_dstline2: \
__asm mov eax, ecx \
__asm and ecx, 3 /* ecx = leftover pixels (0..3) */ \
__asm shr eax, 2 /* eax = number of 4-pixel (8-byte) groups */ \
__asm jz small_move_mmx_dstline2 \
__asm align 8 \
__asm big_move_mmx_dstline2: /* 8 bytes per iteration through mm0 */ \
__asm movq mm0, [esi] \
__asm add esi, 8 \
__asm movq [edi], mm0 \
__asm add edi, 8 \
__asm dec eax \
__asm jnz big_move_mmx_dstline2 \
__asm align 8 \
__asm small_move_mmx_dstline2: \
__asm test ecx, 2 /* two leftover pixels? */ \
__asm jz check2_mmx_dstline2 \
__asm mov eax, [esi] \
__asm add esi, 4 \
__asm mov [edi], eax \
__asm add edi, 4 \
__asm align 8 \
__asm check2_mmx_dstline2: \
__asm test ecx, 1 /* one last pixel? */ \
__asm jz finish_mmx_dstline2 \
__asm mov ax, [esi] \
__asm mov [edi], ax \
__asm align 8 \
__asm finish_mmx_dstline2: \
}
/* Desc: copy one row of 32bit pixels
 *
 * In  : width = number of pixels
 *       src   = source buffer
 *       dst   = destination buffer
 *
 * Note: Aligns dst (LFB) before copying. Clobbers eax, ecx, esi, edi (and
 *       mm0). Rows narrower than 2 pixels skip the alignment and the MMX
 *       loop and are handled by the tail code only. Does not execute emms.
 */
#define MMX_DSTLINE4(src, dst, width) __asm {\
__asm mov ecx, width /* ecx = pixels left */ \
__asm mov esi, src \
__asm mov edi, dst \
__asm cmp ecx, 2 /* fewer than 2 pixels (8 bytes): tail code only */ \
__asm jb small_move_mmx_dstline4 \
__asm test edi, 4 /* dst address bit 2 set: copy one pixel first */ \
__asm jz aligned8_mmx_dstline4 \
__asm mov eax, [esi] \
__asm add esi, 4 \
__asm mov [edi], eax \
__asm add edi, 4 \
__asm dec ecx \
__asm align 8 \
__asm aligned8_mmx_dstline4: \
__asm mov eax, ecx \
__asm and ecx, 1 /* ecx = leftover pixels (0..1) */ \
__asm shr eax, 1 /* eax = number of 2-pixel (8-byte) groups */ \
__asm jz small_move_mmx_dstline4 \
__asm align 8 \
__asm big_move_mmx_dstline4: /* 8 bytes per iteration through mm0 */ \
__asm movq mm0, [esi] \
__asm add esi, 8 \
__asm movq [edi], mm0 \
__asm add edi, 8 \
__asm dec eax \
__asm jnz big_move_mmx_dstline4 \
__asm align 8 \
__asm small_move_mmx_dstline4: \
__asm test ecx, 1 /* one last pixel? */ \
__asm jz finish_mmx_dstline4 \
__asm mov eax, [esi] \
__asm mov [edi], eax \
__asm align 8 \
__asm finish_mmx_dstline4: \
}
#endif
/*---------------------------------------------------------------------------
** grLfbConstantAlpha
*/
@@ -379,7 +704,7 @@ static FxBool _grLfbLock (GrLock_t type, GrBuffer_t buffer,
/* Pray that no one has made any glide calls that touch the hardware... */
#ifdef FX_GLIDE_NAPALM
if((gc->sliCount > 1)/* &&
if((gc->sliCount > 1) /*&&
(type == GR_LFB_READ_ONLY)*/) {
hwcSLIReadDisable(gc->bInfo);
}
@@ -901,7 +1226,7 @@ static FxBool _grLfbLock (GrLock_t type, GrBuffer_t buffer,
}
/* Pray that no one makes any glide calls that touch the hardware... */
#ifdef FX_GLIDE_NAPALM
if((gc->sliCount > 1)/* &&
if((gc->sliCount > 1) /*&&
(type == GR_LFB_READ_ONLY)*/) {
hwcSLIReadEnable(gc->bInfo);
}
@@ -1114,7 +1439,7 @@ static FxBool _grLfbUnlock (GrLock_t type, GrBuffer_t buffer)
gc->lockPtrs[type] = (FxU32)-1;
#ifdef FX_GLIDE_NAPALM
if((gc->sliCount > 1)/* &&
if((gc->sliCount > 1) /*&&
(type == GR_LFB_READ_ONLY)*/) {
hwcSLIReadDisable(gc->bInfo);
}
@@ -1171,7 +1496,7 @@ static FxBool _grLfbUnlock (GrLock_t type, GrBuffer_t buffer)
gc->cmdTransportInfo.lfbLockCount = lockCount - 1;
#ifdef FX_GLIDE_NAPALM
if((gc->sliCount > 1)/* &&
if((gc->sliCount > 1) /*&&
(type == GR_LFB_READ_ONLY)*/) {
if(gc->cmdTransportInfo.lfbLockCount != 0) {
grFinish();
@@ -1343,6 +1668,11 @@ _grLfbWriteRegion(FxBool pixPipelineP,
src_format, src_width, src_height,
src_stride, src_data);
/* don't waste time */
if (!(src_width && src_height)) {
goto done;
}
writeMode = ((src_format == GR_LFB_SRC_FMT_RLE16)
? GR_LFBWRITEMODE_565
: src_format);
@@ -1375,6 +1705,18 @@ _grLfbWriteRegion(FxBool pixPipelineP,
case GR_LFB_SRC_FMT_1555:
case GR_LFB_SRC_FMT_ZA16:
dstData = (FxU32*)(((FxU16*)dstData) + dst_x);
#if 1 /* Hack alert: disable if SET_LFB_16 is not simple assignment */
if (_GlideRoot.CPUType.os_support & _CPU_FEATURE_MMX) {
do {
MMX_DSTLINE2(srcData, dstData, src_width);
/* adjust for next line */
((FxU8 *)srcData) += src_stride;
((FxU8 *)dstData) += info.strideInBytes;
} while (--scanline);
MMX_RESET();
break;
}
#endif
length = src_width * 2;
aligned = !((int)dstData&0x2);
srcJump = src_stride - length;
@@ -1433,6 +1775,18 @@ _grLfbWriteRegion(FxBool pixPipelineP,
case GR_LFB_SRC_FMT_555_DEPTH:
case GR_LFB_SRC_FMT_1555_DEPTH:
dstData = ((FxU32*)dstData) + dst_x;
#if 1 /* Hack alert: disable if SET_LFB is not simple assignment */
if (_GlideRoot.CPUType.os_support & _CPU_FEATURE_MMX) {
do {
MMX_DSTLINE4(srcData, dstData, src_width);
/* adjust for next line */
((FxU8 *)srcData) += src_stride;
((FxU8 *)dstData) += info.strideInBytes;
} while (--scanline);
MMX_RESET();
break;
}
#endif
length = src_width * 4;
srcJump = src_stride - length;
dstJump = info.strideInBytes - length;
@@ -1458,6 +1812,7 @@ _grLfbWriteRegion(FxBool pixPipelineP,
rv = FXFALSE;
}
done:
GR_RETURN(rv);
#undef FN_NAME
} /* _grLfbWriteRegion */
@@ -1559,13 +1914,14 @@ static FxBool grLfbReadRegionOrigin (GrBuffer_t src_buffer, GrOriginLocation_t o
src_buffer, src_x, src_y,
src_width, src_height, dst_stride, dst_data);
/* don't waste time */
if (!(src_width && src_height)) {
rv=FXTRUE;
goto done;
}
bpp=gc->bInfo->h3pixelSize;
info.size = sizeof(info);
if (!src_width)
{
rv=FXTRUE;
goto done;
}
rv=FXFALSE;
#ifndef __linux__ /* [dBorca] fixme :D */
@@ -1632,17 +1988,31 @@ static FxBool grLfbReadRegionOrigin (GrBuffer_t src_buffer, GrOriginLocation_t o
&info))
{
FxU32 *src,*dst;
FxI32 length,scanline;
FxI32 length;
FxU32 src_adjust,dst_adjust,tmp;
src=(FxU32 *) (((char*)info.lfbPtr)+
(src_y*info.strideInBytes) + (src_x * bpp));
length = src_width * bpp;
if (_GlideRoot.CPUType.os_support & _CPU_FEATURE_MMX) {
if (!gc->state.forced32BPP) {
do {
MMX_SRCLINE(src, dst_data, length);
/* adjust for next line */
((FxU8 *)src) += info.strideInBytes;
((FxU8 *)dst_data) += dst_stride;
} while (--src_height);
MMX_RESET();
goto okay;
}
}
dst=dst_data;
scanline=src_height;
/* set length - alignment fix*/
tmp=(((FxU32)src)&2);
length=src_width * bpp - tmp;
length -= tmp;
src_adjust=info.strideInBytes - tmp;
dst_adjust=dst_stride - tmp;
@@ -1723,6 +2093,7 @@ static FxBool grLfbReadRegionOrigin (GrBuffer_t src_buffer, GrOriginLocation_t o
((FxU8 *)dst)+=dst_stride;
}
okay:
rv=FXTRUE;
/* unlock buffer */
_grLfbUnlock(GR_LFB_READ_ONLY,src_buffer);