fixed 3dnow! and mmx optimizations

This commit is contained in:
koolsmoky
2005-04-23 18:36:36 +00:00
parent 881c63f226
commit 74da7049a3
4 changed files with 154 additions and 557 deletions

View File

@@ -29,6 +29,8 @@
# capabilities are still checked at run-time to avoid
# crashes.
# default = no
# USE_MMX=1 allow MMX specializations.
# default = no
# TEXUS2=1 embed Texus2 functions into Glide2.
# default = no
# FXOEM2X=1 build fxoem2x.dll
@@ -103,18 +105,7 @@ CDEFS += -DGDBG_INFO_ON -DGLIDE_DEBUG -DGLIDE_SANITY_ASSERT -DGLIDE_SANITY_SIZE
endif
override USE_FIFO = 1
override USE_X86 = 1
# cpu optimized triangle
ifeq ($(USE_MMX),1)
CFLAGS += -DGL_MMX
override USE_X86 = 1
endif
ifeq ($(USE_3DNOW),1)
CFLAGS += -DGL_AMD3D
override USE_X86 = 1
endif
#override USE_X86 = 1
ifeq ($(USE_X86),1)
CDEFS += -DGLIDE_DISPATCH_SETUP=1 -DGLIDE_DISPATCH_DOWNLOAD=1
@@ -165,6 +156,17 @@ CFLAGS += -I$(FX_GLIDE_SW)/fxmisc -I$(FX_GLIDE_SW)/newpci/pcilib -I$(FX_GLIDE_SW
CFLAGS += -I$(FX_GLIDE_SW)/texus2/lib
CFLAGS += $(CDEFS)
# cpu optimized triangle
ifeq ($(USE_MMX),1)
CFLAGS += -DGL_MMX
override USE_X86 = 1
endif
ifeq ($(USE_3DNOW),1)
CFLAGS += -DGL_AMD3D
override USE_X86 = 1
endif
###############################################################################
# objects
###############################################################################

View File

@@ -19,6 +19,9 @@
;; $Header$
;; $Revision$
;; $Log$
;; Revision 1.1.1.1.2.3 2005/01/22 14:52:02 koolsmoky
;; enabled packed argb for cmd packet type 3
;;
;; Revision 1.1.1.1.2.2 2005/01/13 16:11:39 koolsmoky
;; prepare for packed rgb
;;
@@ -59,33 +62,32 @@
%include "xos.inc"
;;; Definitions of cvg regs and glide root structures.
%include "fxgasm.h"
extrn _GlideRoot
extrn _FifoMakeRoom
extrn _FifoMakeRoom, 12
;; GR_FIFO_WRITE base, offset, value
;; Stores operand %3 to the memory location [%1 + %2].
;; The macro itself does not modify %1; callers advance the pointer themselves.
%MACRO GR_FIFO_WRITE 3
mov [%1 + %2], %3
%ENDMACRO ; GR_FIFO_WRITE
%ifdef GL_AMD3D
;; 3dnow!
;; WRITE_MM1_FIFO_ALIGNED (no arguments)
;; With GL_AMD3D defined: stores all 64 bits of mm1 at [fifo].
;; Without GL_AMD3D: expands to no instructions.
;; The macro does not advance fifo.
%MACRO WRITE_MM1_FIFO_ALIGNED 0
%ifdef GL_AMD3D
movq [fifo], mm1 ; store current param | previous param
%else
;;
%endif
%ENDMACRO ; WRITE_MM1_FIFO_ALIGNED
;; WRITE_MM1LOW_FIFO (no arguments)
;; With GL_AMD3D defined: stores only the low 32 bits of mm1 at [fifo].
;; Without GL_AMD3D: expands to no instructions.
;; The macro does not advance fifo.
%MACRO WRITE_MM1LOW_FIFO 0
%ifdef GL_AMD3D
movd [fifo], mm1 ; store low DWORD of mm1 only (one parameter)
%else
;;
%endif
%ENDMACRO ; WRITE_MM1LOW_FIFO
%MACRO PROC_TYPE 1
proc %1_3DNow, 12
%ENDM
%else
;; original code
%MACRO PROC_TYPE 1
proc %1, 12
%ENDM
%endif
segment DATA
One DD 1.0
Area DD 0
@@ -94,11 +96,10 @@ segment DATA
bias1 DD 0
%ENDIF
;;; Definitions of cvg regs and glide root structures.
%INCLUDE "fxgasm.h"
;; enables/disables trisProcessed and trisDrawn counters
%define STATS 1
segment CONST
$T2003 DD 12288.0
$T2005 DD 1.0
$T2006 DD 256.0
;;; Arguments (STKOFF = 16 from 4 pushes)
STKOFF equ 16
@@ -113,15 +114,20 @@ _vc$ equ 12 + STKOFF
X equ 0
Y equ 4
segment CONST
T2003 DD 12288.0 ; 12288
T2005 DD 1.0 ; 1
T2006 DD 256.0 ; 256
;; PROC_TYPE name
;; Opens a procedure via the `proc` macro. With GL_AMD3D defined the procedure
;; is named <name>_3DNow, otherwise plain <name>, so both variants can be
;; assembled from the same body. 12 is passed as proc's second operand
;; (presumably the size in bytes of the three vertex-pointer arguments --
;; confirm against the definition of `proc`).
%MACRO PROC_TYPE 1
%ifdef GL_AMD3D
proc %1_3DNow, 12
%else
proc %1, 12
%endif
%ENDMACRO ; PROC_TYPE
;; enables/disables trisProcessed and trisDrawn counters
%define STATS 1
segment TEXT
ALIGN 32
PROC_TYPE _trisetup_cull
%define GLIDE_CULLING 1
@@ -139,8 +145,7 @@ endp
%IF GLIDE_PACKED_RGB
ALIGN 32
PROC_TYPE _trisetup_cull_rgb
PROC_TYPE _trisetup_cull_rgb
%define GLIDE_CULLING 1
%define GLIDE_PACK_RGB 1
@@ -155,8 +160,7 @@ PROC_TYPE _trisetup_cull_rgb
endp
ALIGN 32
PROC_TYPE _trisetup_cull_argb
PROC_TYPE _trisetup_cull_argb
%define GLIDE_CULLING 1
%define GLIDE_PACK_RGB 1
@@ -170,10 +174,9 @@ PROC_TYPE _trisetup_cull_argb
endp
%ENDIF ; GLIDE_PACKED_RGB
ALIGN 32
PROC_TYPE _trisetup
ALIGN 32
PROC_TYPE _trisetup
%define GLIDE_CULLING 0
%define GLIDE_PACK_RGB 0
@@ -187,11 +190,10 @@ PROC_TYPE _trisetup
endp
%IF GLIDE_PACKED_RGB
%IF GLIDE_PACKED_RGB
ALIGN 32
PROC_TYPE _trisetup_rgb
PROC_TYPE _trisetup_rgb
%define GLIDE_CULLING 0
%define GLIDE_PACK_RGB 1
@@ -206,8 +208,7 @@ PROC_TYPE _trisetup_rgb
endp
ALIGN 32
PROC_TYPE _trisetup_argb
PROC_TYPE _trisetup_argb
%define GLIDE_CULLING 0
%define GLIDE_PACK_RGB 1

View File

@@ -20,6 +20,9 @@
;; $Header$
;; $Revision$
;; $Log$
;; Revision 1.1.1.1.2.3 2005/01/22 14:52:02 koolsmoky
;; enabled packed argb for cmd packet type 3
;;
;; Revision 1.1.1.1.2.2 2005/01/13 16:11:39 koolsmoky
;; prepare for packed rgb
;;
@@ -61,15 +64,14 @@
;; Prologue stuff
push edi ; save caller's register variable
mov gc, [_GlideRoot+curGC]; GR_DCL_GC
push esi ; save caller's register variable
push ebx ; save caller's register variable
push ebp ; save frame pointer
mov gc, [_GlideRoot+curGC]; GR_DCL_GC
mov fa, [esp + _va$] ; get base address of vertex A
push ebx ; save caller's register variable
mov fb, [esp + _vb$] ; get base address of vertex B
push ebp ; save frame pointer
mov cull, [gc + cull_mode]; get cull mode
mov fc, [esp + _vc$] ; get base address of vertex C
@@ -79,13 +81,13 @@
;; Cull Check
movq mm2, [fc + X] ; yc | xc
movq mm2, [fc + x] ; yc | xc
shl cull, 31 ; culltest << 31
movq mm1, [fb + X] ; yb | xb
movq mm1, [fb + x] ; yb | xb
add tempVal, 4 ; space required in fifo
movq mm0, [fa + X] ; ya | xa
movq mm0, [fa + x] ; ya | xa
mov ebx, [gc + fifoRoom] ; space available in fifo
;; Area_Computation
@@ -129,23 +131,24 @@
push tempVal ; fifo space required
call _FifoMakeRoom ; note: updates fifoPtr
add esp, 12 ; remove 3 DWORD arguments from stack
nop ; filler
;add esp, 12 ; remove 3 DWORD arguments from stack
;nop ; filler
%ELSE ; !GLIDE_CULLING
;; Prologue stuff
push edi ; save caller's register variable
mov gc, [_GlideRoot+curGC]; GR_DCL_GC
push esi ; save caller's register variable
push ebx ; save caller's register variable
push ebp ; save frame pointer
mov gc, [_GlideRoot+curGC]; GR_DCL_GC
mov tempVal, [_GlideRoot+curTriSize] ; data for whole triangle in bytes
push ebx ; save caller's register variable
mov ebx, [gc + fifoRoom] ; fifo space available
push ebp ; save frame pointer
add tempVal, 4 ; fifo space needed (include 4-byte header)
femms ; will use AMD3D, clear FPU/MMX registers
cmp ebx, tempVal ; fifo spce available >= space needed ?
jge .__triBegin ; yup, ready to draw triangle
@@ -155,8 +158,8 @@
push tempVal ; fifo space needed
call _FifoMakeRoom ; note: updates fifoPtr
add esp, 12 ; remove 3 DWORD arguments from stack
nop ; filler
;add esp, 12 ; remove 3 DWORD arguments from stack
;nop ; filler
%ENDIF ; GLIDE_CULLING
@@ -164,20 +167,20 @@
%define dlpstrt ecx ; points to begin of dataList structure
%define vertex edx ; the current vertex
ALIGN 32
ALIGN 32
.__triBegin:
mov eax, [gc+triPacketHdr]; Packet 3 header
lea dlp,[gc + tsuDataList]; Reset the dataList
mov fifo, [gc + fifoPtr] ; Fetch Fifo Ptr
mov vertex, [esp + _va$] ; Current vertex = A
mov dlpstrt, dlp ; save pointer to start of dataList
test fifo, 4 ; is fifo pointer qword aligned ?
test fifo, 4 ; is fifo pointer qword aligned ?
jz .__fifo_aligned ; yes, it is qword aligned
movq mm1, [vertex+X] ; y | x
mov eax, [gc+triPacketHdr]; Packet 3 header
movq mm1, [vertex+x] ; y | x
GR_FIFO_WRITE fifo, 0, eax ; write header to fifo; now qword aligned
add fifo, 4 ; advance fifo for hdr; now qword aligned
@@ -258,24 +261,22 @@
;; here: "write buffer" empty
mov eax,[dlp] ; Get first offset from the data list
test eax, eax ; at end of list ?
add dlp, 4 ; dlp++
lea dlp, [dlp+4] ; dlp++
test eax, eax ; at end of list ?
jz .__paramLoopDoneWBzero1; yes, "write buffer" empty
.__paramLoop1a:
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
test eax, eax ; at end of offset list (offset == 0) ?
jz .__paramLoopDoneWBone1; exit, write buffer contains one DWORD
movd mm2, [eax+vertex] ; get next parameter
add dlp, 8 ; dlp += 2
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
punpckldq mm1, mm2 ; current param | previous param
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
@@ -284,20 +285,20 @@
test eax, eax ; at end of offset list (offset == 0) ?
jnz .__paramLoop1a ; nope, copy next parameter
nop ; filler
nop
jmp .__paramLoopDoneWBzero1; write buffer empty
%ENDIF ; GLIDE_PACK_RGB
.__fifo_aligned:
movd mm2, [vertex+X] ; y | x of vertex A
movd mm2, [vertex+x] ; y | x of vertex A
movd mm1, [gc+triPacketHdr]; Packet 3 header
punpckldq mm1, mm2 ; x | header
WRITE_MM1_FIFO_ALIGNED ; PCI write x | header
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
movd mm1, [vertex+Y] ; 0 | y of vertex A
movd mm1, [vertex+y] ; 0 | y of vertex A
%IF GLIDE_PACK_RGB
%IF GLIDE_PACK_ALPHA
@@ -314,10 +315,10 @@
punpcklwd mm2, mm4 ; 00000000 | 00rr00bb
psrlq mm4, 24 ; 00000000 | 0000gg00
add dlp, 8 ; skip data list entry "a"
psllq mm3, 24 ; 00000000 | aa000000
por mm4, mm2 ; 00000000 | 00rrggbb
add dlp, 8 ; skip data list entry "a"
por mm4, mm3 ; 00000000 | aarrggbb
%ELSE ; !GLIDE_PACK_ALPHA
;; assumes color values < 256.0
@@ -355,17 +356,15 @@
jz .__paramLoopDoneWBone1 ; exit, write buffer contains one DWORD
movd mm2, [eax+vertex] ; get next parameter
add dlp, 4 ; dlp++
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
add dlp, 8 ; dlp += 8
mov eax, [dlp-4] ; offset = *(dlp + 1)
punpckldq mm1, mm2 ; current param | previous param
test eax, eax ; at end of offset list (offset == 0) ?
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
jnz .__paramLoop1b ; nope, copy next parameter
nop ; filler
@@ -380,13 +379,10 @@
.__paramLoop1b:
movd mm2, [eax+vertex] ; get next parameter
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 8 ; dlp += 2
add dlp, 8 ; dlp += 2
punpckldq mm1, mm2 ; current param | previous param
test eax, eax ; at end of offset list (offset == 0) ?
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
@@ -395,9 +391,7 @@
jz .__paramLoopDoneWBzero1; exit, "write buffer" empty
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
test eax, eax ; at end of offset list (offset == 0) ?
jnz .__paramLoop1b ; nope, copy next parameter
@@ -410,13 +404,13 @@
mov dlp, dlpstrt ; reset the dataList
mov vertex, [esp + _vb$] ; Current vertex = B
movd mm2, [vertex+X] ; 0 | x if vertex B
movd mm2, [vertex+x] ; 0 | x if vertex B
punpckldq mm1, mm2 ; x | old param
WRITE_MM1_FIFO_ALIGNED ; PCI write: x | old param
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
movd mm1, [vertex+Y] ; 0 | y of vertex B
movd mm1, [vertex+y] ; 0 | y of vertex B
nop ; filler
%IF GLIDE_PACK_RGB
@@ -502,7 +496,7 @@
movd mm2, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
add dlp, 8 ; dlp += 2
punpckldq mm1, mm2 ; current param | previous param
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
@@ -512,9 +506,7 @@
jz .__paramLoopDoneWBzero2; exit, "write buffer" empty
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
test eax, eax ; at end of offset list (offset == 0) ?
jnz .__paramLoop2b ; nope, copy next parameter
@@ -522,16 +514,16 @@
jmp .__paramLoopDoneWBone2; write buffer contains one DWORD
%ENDIF
.__paramLoopDoneWBzero1:
mov vertex, [esp + _vb$] ; Current vertex = B
mov dlp, dlpstrt ; Reset the dataList
movq mm1, [vertex+X] ; y | x of vertex B
movq mm1, [vertex+x] ; y | x of vertex B
WRITE_MM1_FIFO_ALIGNED ; PCI write y | x of vertex B
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
nop
%IF GLIDE_PACK_RGB
%IF GLIDE_PACK_ALPHA
@@ -571,7 +563,7 @@
;; here: one DWORD in "write buffer", RGB(A)
mov eax, dword [dlp] ; get first offset from the data list
mov eax, [dlp] ; get first offset from the data list
add dlp, 4 ; dlp++
test eax, eax ; end of list ?
@@ -615,15 +607,13 @@
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
test eax, eax ; at end of offset list (offset == 0) ?
jz .__paramLoopDoneWBone2; exit, write buffer contains one DWORD
jz .__paramLoopDoneWBone2 ; exit, write buffer contains one DWORD
movd mm2, [eax+vertex] ; get next parameter
add dlp, 8 ; dlp++
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
punpckldq mm1, mm2 ; current param | previous param
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
@@ -634,16 +624,16 @@
%ENDIF ; GLIDE_PACK_RGB
.__paramLoopDoneWBzero2:
mov vertex, [esp + _vc$] ; Current vertex = C
mov dlp, dlpstrt ; Reset the dataList
movq mm1, [vertex+X] ; y | x of vertex C
movq mm1, [vertex+x] ; y | x of vertex C
WRITE_MM1_FIFO_ALIGNED ; PCI write y | x of vertex C
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
nop
%IF GLIDE_PACK_RGB
%IF GLIDE_PACK_ALPHA
@@ -727,15 +717,13 @@
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
test eax, eax ; at end of offset list (offset == 0) ?
jz .__paramLoopDoneWBone3; exit, write buffer contains one DWORD
movd mm2, [eax+vertex] ; get next parameter
add dlp, 8 ; dlp += 2
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
punpckldq mm1, mm2 ; current param | previous param
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
@@ -748,7 +736,6 @@
%ENDIF ; GLIDE_PACK_RGB
.__paramLoopDoneWBone2:
;; here: "write buffer" has one DWORD left over from vertex B
@@ -756,13 +743,14 @@
mov vertex, [esp + _vc$] ; Current vertex = C
mov dlp, dlpstrt ; reset the dataList
movd mm2, [vertex+X] ; 0 | x if vertex C
movd mm2, [vertex+x] ; 0 | x if vertex C
punpckldq mm1, mm2 ; x | old param
WRITE_MM1_FIFO_ALIGNED ; PCI write: x | old param
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
movd mm1, [vertex+Y] ; 0 | y of vertex C
movd mm1, [vertex+y] ; 0 | y of vertex C
nop
%IF GLIDE_PACK_RGB
%IF GLIDE_PACK_ALPHA
@@ -849,7 +837,7 @@
mov eax, [dlp] ; offset = *(dlp + 1)
punpckldq mm1, mm2 ; current param | previous param
add dlp, 4 ; dlp++
add dlp, 8 ; dlp += 2
WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param
add fifo, 8 ; fifoPtr += 2*sizeof(FxU32)
@@ -858,9 +846,7 @@
jz .__paramLoopDoneWBzero3; exit, "write buffer" empty
movd mm1, [eax+vertex] ; get next parameter
mov eax, [dlp] ; offset = *(dlp + 1)
add dlp, 4 ; dlp++
mov eax, [dlp-4] ; offset = *(dlp + 1)
test eax, eax ; at end of offset list (offset == 0) ?
jnz .__paramLoop3b ; nope, copy next parameter
@@ -884,7 +870,7 @@
mov [gc + fifoPtr], fifo ; save new fifo pointer
mov edx, [gc + fifoRoom] ; old fifo space available
inc ecx ; _GlideRoot.stats.trisDrawn++
add ecx, 1 ; _GlideRoot.stats.trisDrawn++
mov ebp, [_GlideRoot + trisProcessed]; _GlideRoot.stats.trisProcessed
sub eax, ebx ; new fifo ptr - old fifo ptr = additional fifo space used
@@ -895,45 +881,43 @@
mov eax, 1h ; return value = triangle drawn
mov [gc + fifoRoom], edx ; new fifo space available
;; Restore trashed registers
inc ebp ; _GlideRoot.stats.trisProcessed++
pop esi ; restore caller's register variable
add ebp, 1 ; _GlideRoot.stats.trisProcessed++
mov [_GlideRoot + trisProcessed], ebp ;
pop ebx ; restore caller's register variable
pop ebp ; restore frame pointer
pop edi ; restore caller's register variable
femms ; no more AMD3D code, clear FPU/MMX regs
ret ; return to caller
;; Restore trashed registers
pop ebp ; restore frame pointer
pop ebx ; restore caller's register variable
pop esi ; restore caller's register variable
pop edi ; restore caller's register variable
ret ; return to caller
%IF GLIDE_CULLING
.__cullFail:
mov ebp, [_GlideRoot + trisProcessed] ; triangles processed so far
xor eax, eax ; return value = triangle not drawn
add ebp, 1 ; _GlideRoot.stats.trisProcessed++;
mov [_GlideRoot + trisProcessed], ebp
femms ; no more AMD3D code, clear FPU/MMX regs
;; Restore trashed registers
inc ebp ; _GlideRoot.stats.trisProcessed++;
pop esi
mov [_GlideRoot + trisProcessed], ebp
pop ebx
;; Restore trashed registers
pop ebp ; restore frame pointer
pop edi
pop ebx ; restore caller's register variable
pop esi ; restore caller's register variable
pop edi ; restore caller's register variable
ret
%ENDIF ; GLIDE_CULLING
;;--------------------------------------------------------------------------
;; end AMD3D version
;;--------------------------------------------------------------------------
%else
;;--------------------------------------------------------------------------
;; start original code
;;--------------------------------------------------------------------------
@@ -950,7 +934,7 @@
push ebp
nop
align 4
ALIGN 32
%IF GLIDE_CULLING
%define fa eax ; vtx a from caller
%define fb ebx ; vtx b from caller
@@ -1014,7 +998,7 @@
%ENDIF ; GLIDE_CULLING
align 4
ALIGN 32
;; Check to make sure that we have enough room for
;; the complete triangle packet.
mov eax, [_GlideRoot + curTriSize]
@@ -1031,7 +1015,7 @@
push eax
call _FifoMakeRoom
add esp, 12
;add esp, 12
;; Send triangle parameters
@@ -1043,7 +1027,7 @@
%define packCol edi
%define tempVal edi
align 4
ALIGN 32
.__triBegin:
mov fifo, [gc + fifoPtr] ; Fetch Fifo Ptr
mov vOffset, 4 ; Starting vertex
@@ -1054,7 +1038,7 @@
GR_FIFO_WRITE fifo, 0, eax ; Write packet header to fifo
add fifo, 4 ; Advance fifo for hdr & x/y coordinate
align 4
ALIGN 32
.__vertexStart:
mov vertex, [esp + STKOFF + vOffset] ; Current vertex
add fifo, 8
@@ -1062,11 +1046,11 @@
nop ; Avoid p5 agi w/ load of vertex ptr
nop
mov eax, [vertex + X] ; X
mov eax, [vertex + x] ; X
lea dlp, [gc + tsuDataList] ; Reset the dataList
GR_FIFO_WRITE fifo, -8, eax ; PCI write X
mov eax, [vertex + Y] ; Y
mov eax, [vertex + y] ; Y
xor packCol, packCol ; Clear packed color
GR_FIFO_WRITE fifo, -4, eax ; PCI write Y
@@ -1159,7 +1143,7 @@
jne .__paramLoop
align 4
ALIGN 32
.__nextVertex:
;; On to the next vertex
add vOffset, 4

View File

@@ -19,6 +19,10 @@
;; $Header$
;; $Revision$
;; $Log$
;; Revision 1.1.2.1 2004/12/23 20:45:56 koolsmoky
;; converted to nasm syntax
;; added x86 asm, 3dnow! triangle and mmx, 3dnow! texture download optimizations
;;
;; Revision 1.1.1.1.8.1 2003/11/03 13:34:30 dborca
;; Voodoo2 happiness (DJGPP & Linux)
;;
@@ -74,14 +78,14 @@
%include "xos.inc"
extrn _FifoMakeRoom
extrn _FifoMakeRoom, 12
;; _grCommandTransportMakeRoom a, b, c
;; Pushes the three operands in reverse order (%3, %2, %1) and calls
;; _FifoMakeRoom, so %1 ends up at the lowest stack address.
;; Callers in this file reload fifoPtr, fifoRoom and curS after using it.
;; NOTE(review): this listing contains both an active and a commented-out
;; `add esp, 12`. If _FifoMakeRoom removes its own 12 bytes of arguments
;; (as `extrn _FifoMakeRoom, 12` suggests), the active one would unbalance the
;; stack -- confirm which of the two lines is current.
%MACRO _grCommandTransportMakeRoom 3
push %3
push %2
push %1
call _FifoMakeRoom
add esp, 12
;add esp, 12
%ENDMACRO ; _grCommandTransportMakeRoom
;;; Definitions of cvg regs and glide root structures.
@@ -107,19 +111,15 @@ _texData$ equ 24 + STACKOFFSET
%define curS ecx ; texture s-coordinate
%define fRoom edx ; room available in fifo (in bytes)
;--------------------------------------------------------------------------
%IFNDEF GL_SSE2
;--------------------------------------------------------------------------
;
; GL_AMD3D, GL_MMX
;
;--------------------------------------------------------------------------
segment TEXT
segment TEXT
ALIGN 32
ALIGN 32
%IFDEF GL_AMD3D
proc _grTexDownload_3DNow_MMX, 24
@@ -156,7 +156,7 @@ proc _grTexDownload_MMX, 24
sub curT, eax ; curT = maxT - minT
mov fifo, [gc + fifoPtr] ; fifoPtr
mov curS, [esp + _maxS$] ; curS = maxS
mov curS, [esp + _maxS$] ; curS = maxS = scanline width in DWORDs
add curT, 1 ; curT = maxT - minT + 1
%IFDEF GL_AMD3D
@@ -166,23 +166,24 @@ proc _grTexDownload_MMX, 24
emms ; we'll use MMX
%ENDIF
mov edx, curS ; curS = maxS = scanline width in DWORDs
movd mm3, [esp + _baseAddr$] ; 0 | address of texture to download
shl curS, 2 ; scan line width (in bytes)
add curS, curS ;
add curS, curS ; scan line width (in bytes)
mov eax, [esp + _minT$] ; 0 | minT
mov [esp + _maxS$], curS ; save scan line width (in bytes)
shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs
mov edx, curS ;
shl eax, 9 ; TEX_ROW_ADDR_INCR(minT) = minT << 9
add edx, edx ; packetHdr<21:3> = maxS = scanline width in DWORDs
or edx, 0xc0000005 ; packetHdr<31:30> = texture port
; packetHdr<21:3> = maxS
; packetHdr<2:0> = packetType 5
movd mm1, edx ; 0 | packetHdr
movd mm2, eax ; 0 | TEX_ROW_ADDR_INCR(minT)
movd mm2, eax ; 0 | minT
psllq mm2, 9 ; 0 | TEX_ROW_ADDR_INCR(minT) = minT << 9
paddd mm3, mm2 ; 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
movd mm2, [gc + tex_ptr] ; 0 | gc->tex_ptr
@@ -230,7 +231,7 @@ proc _grTexDownload_MMX, 24
mov [gc + fifoPtr], fifo ; store new fifoPtr
jmp .startDownload ; fifo aligned, download texture now
align 32
ALIGN 32
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
;; edx=fifoRoom, mm1 = texAddr-gc->tex_ptr|packetHdr, mm2 = TEX_ROW_ADDR_INCR(1)|0
@@ -327,394 +328,3 @@ proc _grTexDownload_MMX, 24
ret ; pop 6 DWORD parameters and return
endp
%ELSE ; !GL_SSE2
;--------------------------------------------------------------------------
;
; GL_SSE2
;
;--------------------------------------------------------------------------
segment TEXT
ALIGN 32
;; _grTexDownload_SSE2_64
;; Copies (maxT - minT + 1) texture rows from texData into the command fifo.
;; Each row is emitted as one packet: two header DWORDs (hdr1 = packetHdr,
;; hdr2 = texAddr) followed by the row data, moved 8 bytes per iteration
;; through the low half of xmm0.
;; Stack arguments read: _gc$, _baseAddr$, _maxS$, _minT$, _maxT$, _texData$.
;; The _maxS$ argument slot is overwritten with the row width in bytes.
;; Register roles: ebx = curT, edi = dataPtr, esi = gc, ebp = fifo,
;; ecx = curS, edx = fRoom; xmm1 = texAddr | packetHdr,
;; xmm2 = TEX_ROW_ADDR_INCR(1) | 0.
;; gc->fifoPtr and gc->fifoRoom are updated after every row.
proc _grTexDownload_SSE2_64, 24
;; Prologue: argument loads are interleaved with the four register pushes.
;; The -12 / -8 / -4 corrections compensate for pushes not yet executed
;; (presumably STACKOFFSET accounts for all four pushes -- confirm).
push ebx ; save caller's register variable
mov curT, [esp + _maxT$ - 12] ; curT = maxT
push esi ; save caller's register variable
mov eax, [esp + _minT$ - 8] ; minT
push edi ; save caller's register variable
mov gc, [esp + _gc$ - 4] ; gc
push ebp ; save caller's register variable
mov dataPtr, [esp + _texData$]; dataPtr
%IFDEF GLIDE_ALT_TAB
;; Bail out when gc is NULL or bit 0 of *gc->lostContext is set.
test gc, gc
je .dlDone
; mov edx, [gc + windowed]
; test edx, 1
; jnz .pastContextTest
mov edx, DWORD [gc+lostContext]
mov ecx, [edx]
test ecx, 1
jnz .dlDone
;.pastContextTest:
%ENDIF
sub curT, eax ; curT = maxT - minT
mov fifo, [gc + fifoPtr] ; fifoPtr
mov curS, [esp + _maxS$] ; curS = maxS
add curT, 1 ; curT = maxT - minT + 1
mov edx, curS ; curS = maxS = scanline width in DWORDs
movd xmm3,[esp + _baseAddr$] ; 0 | 0 | 0 | address of texture to download
shl curS, 2 ; scan line width (in bytes)
mov eax, [esp + _minT$] ; 0 | 0 | 0 | minT
mov [esp + _maxS$], curS ; save scan line width (in bytes)
shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs
;; Row stride here is the row width in bytes (curS), used for both the
;; starting offset of row minT and the per-row address increment.
imul eax, curS ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1)
movd xmm2,curS ; 0 | 0 | TEX_ROW_ADDR_INCR(1)
or edx, 00000005h ; packetHdr<31:30> = lfb port
; packetHdr<21:3> = maxS
; packetHdr<2:0> = packetType 5
movd xmm1,edx ; 0 | 0 | packetHdr
movd xmm4,eax ; 0 | 0 | TEX_ROW_ADDR_INCR(minT)
psllq xmm2,32 ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0
paddd xmm3,xmm4 ; 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
mov fRoom, [gc + fifoRoom] ; get available fifoRoom (in bytes)
punpckldq xmm1,xmm3 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS
;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
;; If bit 2 of the fifo pointer is set, one NULL DWORD is written first so
;; that every packet starts on an 8-byte boundary.
test fifo, 4 ; is fifo QWORD aligned ?
jz .startDownload ; yup, start texture download
cmp fRoom, 4 ; enough room for NULL packet in fifo?
jge .xmmAlignFifo ; yes, write NULL packet to align fifo
;; NOTE(review): without USE_PACKET_FIFO no room is requested here and the
;; code only reloads the pointers -- confirm that configuration is intended.
;; NOTE(review): xmm1/xmm2 stay live across the call; this assumes
;; _FifoMakeRoom does not modify them -- TODO confirm.
%ifdef USE_PACKET_FIFO
_grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room
%endif
mov fifo, [gc + fifoPtr] ; fifoPtr modified by _grCommandTransportMakeRoom, reload
mov fRoom, [gc + fifoRoom] ; fifoRoom modified by _grCommandTransportMakeRoom, reload
mov curS, [esp + _maxS$] ; reload maxS (destroyed by call to _grCommandTransportMakeRoom)
test fifo, 4 ; new fifoPtr QWORD aligned ?
jz .startDownload ; yup, start texture download
.xmmAlignFifo:
mov DWORD [fifo], 0 ; write NULL packet
sub fRoom, 4 ; fifoRoom -= 4
mov [gc + fifoRoom], fRoom ; store new fifoRoom
add fifo, 4 ; fifoPtr += 4
%IFDEF GLIDE_DEBUG
mov [gc + checkPtr], fifo ; checkPtr
%ENDIF
mov [gc + fifoPtr], fifo ; store new fifoPtr
jmp .startDownload ; fifo aligned, download texture now
align 32
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
;; Outer loop: one iteration per texture row.
.loopT:
%IFDEF GLIDE_DEBUG
;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned
test fifo, 4 ; is fifoPtr QWORD aligned ?
jz .alignmentOK ; yup, continue
xor eax, eax ; create 0
mov [eax], eax ; move to DS:[0] forces GP
.alignmentOK:
%ENDIF ; GLIDE_DEBUG
;; Compute packet header words
;; hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
;; hdr2: download address[29:0]
movq [fifo],xmm1 ; store hdr2 | hdr1
add fifo, 8 ; increment fifo ptr (hdr1 + hdr2)
;; S coordinate inner loop unrolled for 8 texels a write
;; The flags tested by jnz come from `sub curS, 8`; the movq store in between
;; does not change them. The loop exits only when curS reaches exactly zero,
;; so the row width in bytes must be a non-zero multiple of 8
;; (NOTE(review): confirm callers guarantee this).
.loopS:
movq xmm0,[dataPtr] ; load 64 bit data (8 texels)
add fifo, 8 ; pre-increment fifoPtr += 2 * sizeof(FxU32)
add dataPtr, 8 ; dataPtr += 2 * sizeof(FxU32)
sub curS, 8 ; curS -= 2 * sizeof(FxU32)
movq [fifo - 8],xmm0 ; *fifoPtr = texelData[64 bits]
jnz .loopS ; loop until curS == 0
;; Row finished: publish the new fifo pointer and charge the bytes written
;; (headers + data) against fifoRoom.
mov ecx, [gc + fifoPtr] ; old fifo ptr
nop ; filler
mov eax, fifo ; new fifo ptr
mov [gc + fifoPtr], fifo ; save new fifo ptr
%IFDEF GLIDE_DEBUG
mov [gc + checkPtr], fifo ; checkPtr
%ENDIF
sub eax, ecx ; new fifo ptr - old fifo ptr = fifo space used up
mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes)
sub fRoom, eax ; new fifo space available = old fifo space available - fifo space used up = new fifo space available
sub curT, 1 ; curT--
;; The mov below leaves the flags from `sub curT, 1` intact for the jz.
mov [gc + fifoRoom], fRoom ; save new fifo space available
jz .dlDone ; exit when curT == 0 (all rows written)
;; Check for room to write the next texture scanline
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo
;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
paddd xmm1,xmm2 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr
mov esp, esp ; filler
.startDownload:
lea eax, [curS+8] ; fifo space needed = scan line width + header size
cmp fRoom, eax ; fifo space available >= fifo space required ?
jge .loopT ; yup, write next scan line
;; NOTE(review): as above, room is only requested when USE_PACKET_FIFO is
;; defined, and xmm1/xmm2 are assumed to survive the call.
%ifdef USE_PACKET_FIFO
_grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room (if fifoPtr QWORD aligned before
%endif
mov fifo, [gc + fifoPtr] ; fifoPtr was modified by _grCommandTransportMakeRoom, reload
mov fRoom, [gc + fifoRoom] ; fifoRoom was modified by _grCommandTransportMakeRoom, reload
mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes)
jmp .loopT ; we now have enough fifo room, write next scanline
.dlDone:
;; Epilogue: restore registers in reverse order of the pushes above.
pop ebp ; restore caller's register variable
pop edi ; restore caller's register variable
pop esi ; restore caller's register variable
pop ebx ; restore caller's register variable
;; NOTE(review): argument cleanup relies on how the proc macro treats
;; `ret` with the declared size of 24 -- confirm in xos.inc.
ret ; pop 6 DWORD parameters and return
endp
segment TEXT
ALIGN 32
;; _grTexDownload_SSE2_128
;; Copies (maxT - minT + 1) texture rows from texData into the command fifo.
;; Each row is emitted as one packet: two header DWORDs (hdr1 = packetHdr,
;; hdr2 = texAddr) followed by the row data, moved 16 bytes per iteration
;; with unaligned 128-bit loads and stores (movdqu).
;; Stack arguments read: _gc$, _baseAddr$, _maxS$, _minT$, _maxT$, _texData$.
;; The _maxS$ argument slot is overwritten with the row width in bytes.
;; Register roles: ebx = curT, edi = dataPtr, esi = gc, ebp = fifo,
;; ecx = curS, edx = fRoom; xmm1 = texAddr | packetHdr,
;; xmm2 = TEX_ROW_ADDR_INCR(1) | 0.
;; gc->fifoPtr and gc->fifoRoom are updated after every row.
proc _grTexDownload_SSE2_128, 24
;; Prologue: argument loads are interleaved with the four register pushes.
;; The -12 / -8 / -4 corrections compensate for pushes not yet executed
;; (presumably STACKOFFSET accounts for all four pushes -- confirm).
push ebx ; save caller's register variable
mov curT, [esp + _maxT$ - 12] ; curT = maxT
push esi ; save caller's register variable
mov eax, [esp + _minT$ - 8] ; minT
push edi ; save caller's register variable
mov gc, [esp + _gc$ - 4] ; gc
push ebp ; save caller's register variable
mov dataPtr, [esp + _texData$]; dataPtr
%IFDEF GLIDE_ALT_TAB
;; Bail out when gc is NULL or bit 0 of *gc->lostContext is set.
test gc, gc
je .dlDone
; mov edx, [gc + windowed]
; test edx, 1
; jnz .pastContextTest
mov edx, DWORD [gc+lostContext]
mov ecx, [edx]
test ecx, 1
jnz .dlDone
;.pastContextTest:
%ENDIF
sub curT, eax ; curT = maxT - minT
mov fifo, [gc + fifoPtr] ; fifoPtr
mov curS, [esp + _maxS$] ; curS = maxS
add curT, 1 ; curT = maxT - minT + 1
mov edx, curS ; curS = maxS = scanline width in DWORDs
movd xmm3,[esp + _baseAddr$] ; 0 | 0 | 0 | address of texture to download
shl curS, 2 ; scan line width (in bytes)
mov eax, [esp + _minT$] ; 0 | minT
mov [esp + _maxS$], curS ; save scan line width (in bytes)
shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs
;; Row stride here is the row width in bytes (curS), used for both the
;; starting offset of row minT and the per-row address increment.
imul eax, curS ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1)
movd xmm2,curS ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(1)
or edx, 00000005h ; packetHdr<31:30> = lfb port
; packetHdr<21:3> = maxS
; packetHdr<2:0> = packetType 5
movd xmm1,edx ; 0 | 0 | 0 | packetHdr
movd xmm4,eax ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(minT)
psllq xmm2,32 ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0
paddd xmm3,xmm4 ; 0 | 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
mov fRoom, [gc + fifoRoom] ; get available fifoRoom (in bytes)
punpckldq xmm1,xmm3 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS
;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
;; If bit 2 of the fifo pointer is set, one NULL DWORD is written first so
;; that every packet starts on an 8-byte boundary.
test fifo, 4 ; is fifo QWORD aligned ?
jz .startDownload ; yup, start texture download
cmp fRoom, 4 ; enough room for NULL packet in fifo?
jge .xmmAlignFifo ; yes, write NULL packet to align fifo
;; NOTE(review): without USE_PACKET_FIFO no room is requested here and the
;; code only reloads the pointers -- confirm that configuration is intended.
;; NOTE(review): xmm1/xmm2 stay live across the call; this assumes
;; _FifoMakeRoom does not modify them -- TODO confirm.
%ifdef USE_PACKET_FIFO
_grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room
%endif
mov fifo, [gc + fifoPtr] ; fifoPtr modified by _grCommandTransportMakeRoom, reload
mov fRoom, [gc + fifoRoom] ; fifoRoom modified by _grCommandTransportMakeRoom, reload
mov curS, [esp + _maxS$] ; reload maxS (destroyed by call to _grCommandTransportMakeRoom)
test fifo, 4 ; new fifoPtr QWORD aligned ?
jz .startDownload ; yup, start texture download
.xmmAlignFifo:
mov DWORD [fifo], 0 ; write NULL packet
sub fRoom, 4 ; fifoRoom -= 4
mov [gc + fifoRoom], fRoom ; store new fifoRoom
add fifo, 4 ; fifoPtr += 4
%IFDEF GLIDE_DEBUG
mov [gc + checkPtr], fifo ; checkPtr
%ENDIF
mov [gc + fifoPtr], fifo ; store new fifoPtr
jmp .startDownload ; fifo aligned, download texture now
align 32
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
;; Outer loop: one iteration per texture row.
.loopT:
%IFDEF GLIDE_DEBUG
;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned
test fifo, 4 ; is fifoPtr QWORD aligned ?
jz .alignmentOK ; yup, continue
xor eax, eax ; create 0
mov [eax], eax ; move to DS:[0] forces GP
.alignmentOK:
%ENDIF ; GLIDE_DEBUG
;; Compute packet header words
;; hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
;; hdr2: download address[29:0]
movq [fifo],xmm1 ; store hdr2 | hdr1
add fifo, 8 ; increment fifo ptr (hdr1 + hdr2)
;; S coordinate inner loop unrolled for 8 texels a write
;; The flags tested by jnz come from `sub curS, 16`; the movdqu store in
;; between does not change them. The loop exits only when curS reaches exactly
;; zero, so the row width in bytes must be a non-zero multiple of 16
;; (NOTE(review): confirm callers guarantee this).
;; movdqu is used for both load and store, so neither dataPtr nor the fifo
;; position after the 8-byte header needs 16-byte alignment.
.loopS:
movdqu xmm0, [dataPtr] ; load 128 bit data (8 texels) ; isn't 16 bytes aligned?
add fifo, 16 ; pre-increment fifoPtr += 4 * sizeof(FxU32)
add dataPtr, 16 ; dataPtr += 4 * sizeof(FxU32)
sub curS, 16 ; curS -= 4 * sizeof(FxU32)
movdqu [fifo - 16], xmm0 ; *fifoPtr = texelData[128 bits] ; isn't 16 bytes aligned?
jnz .loopS ; loop until curS == 0
;; Row finished: publish the new fifo pointer and charge the bytes written
;; (headers + data) against fifoRoom.
mov ecx, [gc + fifoPtr] ; old fifo ptr
nop ; filler
mov eax, fifo ; new fifo ptr
mov [gc + fifoPtr], fifo ; save new fifo ptr
%IFDEF GLIDE_DEBUG
mov [gc + checkPtr], fifo ; checkPtr
%ENDIF
sub eax, ecx ; new fifo ptr - old fifo ptr = fifo space used up
mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes)
sub fRoom, eax ; new fifo space available = old fifo space available - fifo space used up = new fifo space available
sub curT, 1 ; curT--
;; The mov below leaves the flags from `sub curT, 1` intact for the jz.
mov [gc + fifoRoom], fRoom ; save new fifo space available
jz .dlDone ; exit when curT == 0 (all rows written)
;; Check for room to write the next texture scanline
;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo
;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
paddd xmm1,xmm2 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr
mov esp, esp ; filler
.startDownload:
lea eax, [curS+8] ; fifo space needed = scan line width + header size
cmp fRoom, eax ; fifo space available >= fifo space required ?
jge .loopT ; yup, write next scan line
;; NOTE(review): as above, room is only requested when USE_PACKET_FIFO is
;; defined, and xmm1/xmm2 are assumed to survive the call.
%ifdef USE_PACKET_FIFO
_grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room (if fifoPtr QWORD aligned before
%endif
mov fifo, [gc + fifoPtr] ; fifoPtr was modified by _grCommandTransportMakeRoom, reload
mov fRoom, [gc + fifoRoom] ; fifoRoom was modified by _grCommandTransportMakeRoom, reload
mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes)
jmp .loopT ; we now have enough fifo room, write next scanline
.dlDone:
;; Epilogue: restore registers in reverse order of the pushes above.
pop ebp ; restore caller's register variable
pop edi ; restore caller's register variable
pop esi ; restore caller's register variable
pop ebx ; restore caller's register variable
;; NOTE(review): argument cleanup relies on how the proc macro treats
;; `ret` with the declared size of 24 -- confirm in xos.inc.
ret ; pop 6 DWORD parameters and return
endp
%ENDIF ; GL_SSE2