From 74da7049a308a67a31c83fba1844b49819e835bb Mon Sep 17 00:00:00 2001 From: koolsmoky <> Date: Sat, 23 Apr 2005 18:36:36 +0000 Subject: [PATCH] fixed 3dnow! and mmx optimizations --- glide2x/cvg/glide/src/Makefile.win32 | 26 +- glide2x/cvg/glide/src/xdraw2.asm | 71 ++--- glide2x/cvg/glide/src/xdraw2.inc | 192 ++++++------ glide2x/cvg/glide/src/xtexdl.asm | 422 +-------------------------- 4 files changed, 154 insertions(+), 557 deletions(-) diff --git a/glide2x/cvg/glide/src/Makefile.win32 b/glide2x/cvg/glide/src/Makefile.win32 index 7d688ed..ee98d4c 100644 --- a/glide2x/cvg/glide/src/Makefile.win32 +++ b/glide2x/cvg/glide/src/Makefile.win32 @@ -29,6 +29,8 @@ # capabilities are still checked at run-time to avoid # crashes. # default = no +# USE_MMX=1 allow MMX specializations. +# default = no # TEXUS2=1 embed Texus2 functions into Glide2. # default = no # FXOEM2X=1 build fxoem2x.dll @@ -103,18 +105,7 @@ CDEFS += -DGDBG_INFO_ON -DGLIDE_DEBUG -DGLIDE_SANITY_ASSERT -DGLIDE_SANITY_SIZE endif override USE_FIFO = 1 -override USE_X86 = 1 - -# cpu optimized triangle -ifeq ($(USE_MMX),1) -CFLAGS += -DGL_MMX -override USE_X86 = 1 -endif - -ifeq ($(USE_3DNOW),1) -CFLAGS += -DGL_AMD3D -override USE_X86 = 1 -endif +#override USE_X86 = 1 ifeq ($(USE_X86),1) CDEFS += -DGLIDE_DISPATCH_SETUP=1 -DGLIDE_DISPATCH_DOWNLOAD=1 @@ -165,6 +156,17 @@ CFLAGS += -I$(FX_GLIDE_SW)/fxmisc -I$(FX_GLIDE_SW)/newpci/pcilib -I$(FX_GLIDE_SW CFLAGS += -I$(FX_GLIDE_SW)/texus2/lib CFLAGS += $(CDEFS) +# cpu optimized triangle +ifeq ($(USE_MMX),1) +CFLAGS += -DGL_MMX +override USE_X86 = 1 +endif + +ifeq ($(USE_3DNOW),1) +CFLAGS += -DGL_AMD3D +override USE_X86 = 1 +endif + ############################################################################### # objects ############################################################################### diff --git a/glide2x/cvg/glide/src/xdraw2.asm b/glide2x/cvg/glide/src/xdraw2.asm index 801ee0e..8441a19 100644 --- a/glide2x/cvg/glide/src/xdraw2.asm +++ b/glide2x/cvg/glide/src/xdraw2.asm @@ -19,6 +19,9 @@ ;; $Header$ ;; $Revision$ ;; $Log$ +;; Revision 1.1.1.1.2.3 2005/01/22 14:52:02 koolsmoky +;; enabled packed argb for cmd packet type 3 +;; ;; Revision 1.1.1.1.2.2 2005/01/13 16:11:39 koolsmoky ;; prepare for packed rgb ;; @@ -59,33 +62,32 @@ %include "xos.inc" +;;; Definitions of cvg regs and glide root structures. +%include "fxgasm.h" + extrn _GlideRoot -extrn _FifoMakeRoom +extrn _FifoMakeRoom, 12 %MACRO GR_FIFO_WRITE 3 mov [%1 + %2], %3 %ENDMACRO ; GR_FIFO_WRITE -%ifdef GL_AMD3D -;; 3dnow! %MACRO WRITE_MM1_FIFO_ALIGNED 0 + %ifdef GL_AMD3D movq [fifo], mm1 ; store current param | previous param + %else + ;; + %endif %ENDMACRO ; WRITE_MM1_FIFO_ALIGNED %MACRO WRITE_MM1LOW_FIFO 0 + %ifdef GL_AMD3D movd [fifo], mm1 ; store current param | previous param + %else + ;; + %endif %ENDMACRO ; WRITE_MM1LOW_FIFO -%MACRO PROC_TYPE 1 - proc %1_3DNow, 12 -%ENDM -%else -;; original code -%MACRO PROC_TYPE 1 - proc %1, 12 -%ENDM -%endif - segment DATA One DD 1.0 Area DD 0 @@ -94,11 +96,10 @@ segment DATA bias1 DD 0 %ENDIF -;;; Definitions of cvg regs and glide root structures. -%INCLUDE "fxgasm.h" - -;; enables/disables trisProcessed and trisDrawn counters -%define STATS 1 +segment CONST +$T2003 DD 12288.0 +$T2005 DD 1.0 +$T2006 DD 256.0 ;;; Arguments (STKOFF = 16 from 4 pushes) STKOFF equ 16 @@ -113,15 +114,20 @@ _vc$ equ 12 + STKOFF X equ 0 Y equ 4 -segment CONST -T2003 DD 12288.0 ; 12288 -T2005 DD 1.0 ; 1 -T2006 DD 256.0 ; 256 +%MACRO PROC_TYPE 1 + %ifdef GL_AMD3D + proc %1_3DNow, 12 + %else + proc %1, 12 + %endif +%ENDMACRO ; PROC_TYPE + +;; enables/disables trisProcessed and trisDrawn counters +%define STATS 1 segment TEXT ALIGN 32 - PROC_TYPE _trisetup_cull %define GLIDE_CULLING 1 @@ -139,8 +145,7 @@ endp %IF GLIDE_PACKED_RGB ALIGN 32 - -PROC_TYPE _trisetup_cull_rgb +PROC_TYPE _trisetup_cull_rgb %define GLIDE_CULLING 1 %define GLIDE_PACK_RGB 1 @@ -155,8 +160,7 @@ PROC_TYPE _trisetup_cull_rgb endp ALIGN 32 - -PROC_TYPE _trisetup_cull_argb +PROC_TYPE _trisetup_cull_argb %define GLIDE_CULLING 1 %define GLIDE_PACK_RGB 1 @@ -170,10 +174,9 @@ PROC_TYPE _trisetup_cull_argb endp %ENDIF ; GLIDE_PACKED_RGB - - ALIGN 32 -PROC_TYPE _trisetup + ALIGN 32 +PROC_TYPE _trisetup %define GLIDE_CULLING 0 %define GLIDE_PACK_RGB 0 @@ -187,11 +190,10 @@ PROC_TYPE _trisetup endp -%IF GLIDE_PACKED_RGB +%IF GLIDE_PACKED_RGB ALIGN 32 - -PROC_TYPE _trisetup_rgb +PROC_TYPE _trisetup_rgb %define GLIDE_CULLING 0 %define GLIDE_PACK_RGB 1 @@ -206,8 +208,7 @@ PROC_TYPE _trisetup_rgb endp ALIGN 32 - -PROC_TYPE _trisetup_argb +PROC_TYPE _trisetup_argb %define GLIDE_CULLING 0 %define GLIDE_PACK_RGB 1 diff --git a/glide2x/cvg/glide/src/xdraw2.inc b/glide2x/cvg/glide/src/xdraw2.inc index 27620a8..1ed5b98 100644 --- a/glide2x/cvg/glide/src/xdraw2.inc +++ b/glide2x/cvg/glide/src/xdraw2.inc @@ -20,6 +20,9 @@ ;; $Header$ ;; $Revision$ ;; $Log$ +;; Revision 1.1.1.1.2.3 2005/01/22 14:52:02 koolsmoky +;; enabled packed argb for cmd packet type 3 +;; ;; Revision 1.1.1.1.2.2 2005/01/13 16:11:39 koolsmoky ;; prepare for packed rgb ;; @@ -61,15 +64,14 @@ ;; Prologue stuff push edi ; save caller's register variable - mov gc, [_GlideRoot+curGC]; GR_DCL_GC - push esi ; save caller's register variable + push ebx ; save caller's register variable + push ebp ; save frame pointer + + mov gc, [_GlideRoot+curGC]; GR_DCL_GC mov fa, [esp + _va$] ; get base address of vertex A - push ebx ; save caller's register variable mov fb, [esp + _vb$] ; get base address of vertex B - - push ebp ; save frame pointer mov cull, [gc + cull_mode]; get cull mode mov fc, [esp + _vc$] ; get base address of vertex C @@ -79,13 +81,13 @@ ;; Cull Check - movq mm2, [fc + X] ; yc | xc + movq mm2, [fc + x] ; yc | xc shl cull, 31 ; culltest << 31 - movq mm1, [fb + X] ; yb | xb + movq mm1, [fb + x] ; yb | xb add tempVal, 4 ; space required in fifo - movq mm0, [fa + X] ; ya | xa + movq mm0, [fa + x] ; ya | xa mov ebx, [gc + fifoRoom] ; space available in fifo ;; Area_Computation @@ -129,23 +131,24 @@ push tempVal ; fifo space required call _FifoMakeRoom ; note: updates fifoPtr - add esp, 12 ; remove 3 DWORD arguments from stack - nop ; filler - + ;add esp, 12 ; remove 3 DWORD arguments from stack + ;nop ; filler %ELSE ; !GLIDE_CULLING + ;; Prologue stuff push edi ; save caller's register variable - mov gc, [_GlideRoot+curGC]; GR_DCL_GC - push esi ; save caller's register variable + push ebx ; save caller's register variable + push ebp ; save frame pointer + + mov gc, [_GlideRoot+curGC]; GR_DCL_GC mov tempVal, [_GlideRoot+curTriSize] ; data for whole triangle in bytes - push ebx ; save caller's register variable mov ebx, [gc + fifoRoom] ; fifo space available - - push ebp ; save frame pointer add tempVal, 4 ; fifo space needed (include 4-byte header) + femms ; will use AMD3D, clear FPU/MMX registers + cmp ebx, tempVal ; fifo spce available >= space needed ? jge .__triBegin ; yup, ready to draw triangle @@ -155,8 +158,8 @@ push tempVal ; fifo space needed call _FifoMakeRoom ; note: updates fifoPtr - add esp, 12 ; remove 3 DWORD arguments from stack - nop ; filler + ;add esp, 12 ; remove 3 DWORD arguments from stack + ;nop ; filler %ENDIF ; GLIDE_CULLING @@ -164,20 +167,20 @@ %define dlpstrt ecx ; points to begin of dataList structure %define vertex edx ; the current vertex - ALIGN 32 + ALIGN 32 .__triBegin: - mov eax, [gc+triPacketHdr]; Packet 3 header lea dlp,[gc + tsuDataList]; Reset the dataList - mov fifo, [gc + fifoPtr] ; Fetch Fifo Ptr + mov vertex, [esp + _va$] ; Current vertex = A - mov dlpstrt, dlp ; save pointer to start of dataList - test fifo, 4 ; is fifo pointer qword aligned ? + test fifo, 4 ; is fifo pointer qword aligned ? jz .__fifo_aligned ; yes, it is qword aligned - movq mm1, [vertex+X] ; y | x + + mov eax, [gc+triPacketHdr]; Packet 3 header + movq mm1, [vertex+x] ; y | x GR_FIFO_WRITE fifo, 0, eax ; write header to fifo; now qword aligned add fifo, 4 ; advance fifo for hdr; now qword aligned @@ -258,24 +261,22 @@ ;; here: "write buffer" empty mov eax,[dlp] ; Get first offset from the data list - test eax, eax ; at end of list ? + add dlp, 4 ; dlp++ - lea dlp, [dlp+4] ; dlp++ + test eax, eax ; at end of list ? jz .__paramLoopDoneWBzero1; yes, "write buffer" empty .__paramLoop1a: movd mm1, [eax+vertex] ; get next parameter mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ test eax, eax ; at end of offset list (offset == 0) ? - jz .__paramLoopDoneWBone1; exit, write buffer contains one DWORD + movd mm2, [eax+vertex] ; get next parameter + add dlp, 8 ; dlp += 2 - mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ - + mov eax, [dlp-4] ; offset = *(dlp + 1) punpckldq mm1, mm2 ; current param | previous param WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param @@ -284,20 +285,20 @@ test eax, eax ; at end of offset list (offset == 0) ? jnz .__paramLoop1a ; nope, copy next parameter - nop ; filler + nop jmp .__paramLoopDoneWBzero1; write buffer empty %ENDIF ; GLIDE_PACK_RGB .__fifo_aligned: - movd mm2, [vertex+X] ; y | x of vertex A + movd mm2, [vertex+x] ; y | x of vertex A movd mm1, [gc+triPacketHdr]; Packet 3 header punpckldq mm1, mm2 ; x | header WRITE_MM1_FIFO_ALIGNED ; PCI write x | header add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) - movd mm1, [vertex+Y] ; 0 | y of vertex A + movd mm1, [vertex+y] ; 0 | y of vertex A %IF GLIDE_PACK_RGB %IF GLIDE_PACK_ALPHA @@ -314,10 +315,10 @@ punpcklwd mm2, mm4 ; 00000000 | 00rr00bb psrlq mm4, 24 ; 00000000 | 0000gg00 - add dlp, 8 ; skip data list entry "a" psllq mm3, 24 ; 00000000 | aa000000 - por mm4, mm2 ; 00000000 | 00rrggbb + + add dlp, 8 ; skip data list entry "a" por mm4, mm3 ; 00000000 | aarrggbb %ELSE ; !GLIDE_PACK_ALPHA ;; assumes color values < 256.0 @@ -355,17 +356,15 @@ jz .__paramLoopDoneWBone1 ; exit, write buffer contains one DWORD movd mm2, [eax+vertex] ; get next parameter - add dlp, 4 ; dlp++ - - mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ + add dlp, 8 ; dlp += 8 + mov eax, [dlp-4] ; offset = *(dlp + 1) punpckldq mm1, mm2 ; current param | previous param + test eax, eax ; at end of offset list (offset == 0) ? - WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param - add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) + add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) jnz .__paramLoop1b ; nope, copy next parameter nop ; filler @@ -380,13 +379,10 @@ .__paramLoop1b: movd mm2, [eax+vertex] ; get next parameter - add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) - mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 8 ; dlp += 2 + add dlp, 8 ; dlp += 2 punpckldq mm1, mm2 ; current param | previous param - test eax, eax ; at end of offset list (offset == 0) ? WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) @@ -395,9 +391,7 @@ jz .__paramLoopDoneWBzero1; exit, "write buffer" empty movd mm1, [eax+vertex] ; get next parameter - mov eax, [dlp] ; offset = *(dlp + 1) - - add dlp, 4 ; dlp++ + mov eax, [dlp-4] ; offset = *(dlp + 1) test eax, eax ; at end of offset list (offset == 0) ? jnz .__paramLoop1b ; nope, copy next parameter @@ -410,13 +404,13 @@ mov dlp, dlpstrt ; reset the dataList mov vertex, [esp + _vb$] ; Current vertex = B - movd mm2, [vertex+X] ; 0 | x if vertex B + movd mm2, [vertex+x] ; 0 | x if vertex B punpckldq mm1, mm2 ; x | old param WRITE_MM1_FIFO_ALIGNED ; PCI write: x | old param add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) - movd mm1, [vertex+Y] ; 0 | y of vertex B + movd mm1, [vertex+y] ; 0 | y of vertex B nop ; filler %IF GLIDE_PACK_RGB @@ -502,7 +496,7 @@ movd mm2, [eax+vertex] ; get next parameter mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ + add dlp, 8 ; dlp += 2 punpckldq mm1, mm2 ; current param | previous param WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param @@ -512,9 +506,7 @@ jz .__paramLoopDoneWBzero2; exit, "write buffer" empty movd mm1, [eax+vertex] ; get next parameter - mov eax, [dlp] ; offset = *(dlp + 1) - - add dlp, 4 ; dlp++ + mov eax, [dlp-4] ; offset = *(dlp + 1) test eax, eax ; at end of offset list (offset == 0) ? jnz .__paramLoop2b ; nope, copy next parameter @@ -522,16 +514,16 @@ jmp .__paramLoopDoneWBone2; write buffer contains one DWORD %ENDIF - .__paramLoopDoneWBzero1: mov vertex, [esp + _vb$] ; Current vertex = B mov dlp, dlpstrt ; Reset the dataList - movq mm1, [vertex+X] ; y | x of vertex B + movq mm1, [vertex+x] ; y | x of vertex B WRITE_MM1_FIFO_ALIGNED ; PCI write y | x of vertex B add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) + nop %IF GLIDE_PACK_RGB %IF GLIDE_PACK_ALPHA @@ -571,7 +563,7 @@ ;; here: one DWORD in "write buffer", RGB(A) - mov eax, dword [dlp] ; get first offset from the data list + mov eax, [dlp] ; get first offset from the data list add dlp, 4 ; dlp++ test eax, eax ; end of list ? @@ -615,15 +607,13 @@ movd mm1, [eax+vertex] ; get next parameter mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ test eax, eax ; at end of offset list (offset == 0) ? + jz .__paramLoopDoneWBone2; exit, write buffer contains one DWORD - jz .__paramLoopDoneWBone2 ; exit, write buffer contains one DWORD movd mm2, [eax+vertex] ; get next parameter + add dlp, 8 ; dlp++ - mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ - + mov eax, [dlp-4] ; offset = *(dlp + 1) punpckldq mm1, mm2 ; current param | previous param WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param @@ -634,16 +624,16 @@ %ENDIF ; GLIDE_PACK_RGB - .__paramLoopDoneWBzero2: mov vertex, [esp + _vc$] ; Current vertex = C mov dlp, dlpstrt ; Reset the dataList - movq mm1, [vertex+X] ; y | x of vertex C + movq mm1, [vertex+x] ; y | x of vertex C WRITE_MM1_FIFO_ALIGNED ; PCI write y | x of vertex C add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) + nop %IF GLIDE_PACK_RGB %IF GLIDE_PACK_ALPHA @@ -727,15 +717,13 @@ movd mm1, [eax+vertex] ; get next parameter mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ test eax, eax ; at end of offset list (offset == 0) ? - jz .__paramLoopDoneWBone3; exit, write buffer contains one DWORD + movd mm2, [eax+vertex] ; get next parameter + add dlp, 8 ; dlp += 2 - mov eax, [dlp] ; offset = *(dlp + 1) - add dlp, 4 ; dlp++ - + mov eax, [dlp-4] ; offset = *(dlp + 1) punpckldq mm1, mm2 ; current param | previous param WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param @@ -748,7 +736,6 @@ %ENDIF ; GLIDE_PACK_RGB - .__paramLoopDoneWBone2: ;; here: "write buffer" has one DWORD left over from vertex B @@ -756,13 +743,14 @@ mov vertex, [esp + _vc$] ; Current vertex = C mov dlp, dlpstrt ; reset the dataList - movd mm2, [vertex+X] ; 0 | x if vertex C + movd mm2, [vertex+x] ; 0 | x if vertex C punpckldq mm1, mm2 ; x | old param WRITE_MM1_FIFO_ALIGNED ; PCI write: x | old param add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) - movd mm1, [vertex+Y] ; 0 | y of vertex C + movd mm1, [vertex+y] ; 0 | y of vertex C + nop %IF GLIDE_PACK_RGB %IF GLIDE_PACK_ALPHA @@ -849,7 +837,7 @@ mov eax, [dlp] ; offset = *(dlp + 1) punpckldq mm1, mm2 ; current param | previous param - add dlp, 4 ; dlp++ + add dlp, 8 ; dlp += 2 WRITE_MM1_FIFO_ALIGNED ; PCI write current param | previous param add fifo, 8 ; fifoPtr += 2*sizeof(FxU32) @@ -858,9 +846,7 @@ jz .__paramLoopDoneWBzero3; exit, "write buffer" empty movd mm1, [eax+vertex] ; get next parameter - mov eax, [dlp] ; offset = *(dlp + 1) - - add dlp, 4 ; dlp++ + mov eax, [dlp-4] ; offset = *(dlp + 1) test eax, eax ; at end of offset list (offset == 0) ? jnz .__paramLoop3b ; nope, copy next parameter @@ -884,7 +870,7 @@ mov [gc + fifoPtr], fifo ; save new fifo pointer mov edx, [gc + fifoRoom] ; old fifo space available - inc ecx ; _GlideRoot.stats.trisDrawn++ + add ecx, 1 ; _GlideRoot.stats.trisDrawn++ mov ebp, [_GlideRoot + trisProcessed]; _GlideRoot.stats.trisProcessed sub eax, ebx ; new fifo ptr - old fifo ptr = additional fifo space used @@ -895,45 +881,43 @@ mov eax, 1h ; return value = triangle drawn mov [gc + fifoRoom], edx ; new fifo space available - ;; Restore trashed registers - - inc ebp ; _GlideRoot.stats.trisProcessed++ - pop esi ; restore caller's register variable - + add ebp, 1 ; _GlideRoot.stats.trisProcessed++ mov [_GlideRoot + trisProcessed], ebp ; - pop ebx ; restore caller's register variable - - pop ebp ; restore frame pointer - pop edi ; restore caller's register variable femms ; no more AMD3D code, clear FPU/MMX regs - ret ; return to caller + ;; Restore trashed registers + pop ebp ; restore frame pointer + pop ebx ; restore caller's register variable + pop esi ; restore caller's register variable + pop edi ; restore caller's register variable + ret ; return to caller %IF GLIDE_CULLING .__cullFail: mov ebp, [_GlideRoot + trisProcessed] ; triangles processed so far xor eax, eax ; return value = triangle not drawn + add ebp, 1 ; _GlideRoot.stats.trisProcessed++; + mov [_GlideRoot + trisProcessed], ebp + femms ; no more AMD3D code, clear FPU/MMX regs - ;; Restore trashed registers - inc ebp ; _GlideRoot.stats.trisProcessed++; - pop esi - - mov [_GlideRoot + trisProcessed], ebp - pop ebx - + ;; Restore trashed registers pop ebp ; restore frame pointer - pop edi + pop ebx ; restore caller's register variable + pop esi ; restore caller's register variable + pop edi ; restore caller's register variable ret %ENDIF ; GLIDE_CULLING ;;-------------------------------------------------------------------------- ;; end AMD3D version ;;-------------------------------------------------------------------------- + %else + ;;-------------------------------------------------------------------------- ;; start original code ;;-------------------------------------------------------------------------- @@ -950,7 +934,7 @@ push ebp nop - align 4 + ALIGN 32 %IF GLIDE_CULLING %define fa eax ; vtx a from caller %define fb ebx ; vtx b from caller @@ -1014,7 +998,7 @@ %ENDIF ; GLIDE_CULLING - align 4 + ALIGN 32 ;; Check to make sure that we have enough room for ;; the complete triangle packet. mov eax, [_GlideRoot + curTriSize] @@ -1031,7 +1015,7 @@ push eax call _FifoMakeRoom - add esp, 12 + ;add esp, 12 ;; Send triangle parameters @@ -1043,7 +1027,7 @@ %define packCol edi %define tempVal edi - align 4 + ALIGN 32 .__triBegin: mov fifo, [gc + fifoPtr] ; Fetch Fifo Ptr mov vOffset, 4 ; Starting vertex @@ -1054,7 +1038,7 @@ GR_FIFO_WRITE fifo, 0, eax ; Write packet header to fifo add fifo, 4 ; Advance fifo for hdr & x/y coordinate - align 4 + ALIGN 32 .__vertexStart: mov vertex, [esp + STKOFF + vOffset] ; Current vertex add fifo, 8 @@ -1062,11 +1046,11 @@ nop ; Avoid p5 agi w/ load of vertex ptr nop - mov eax, [vertex + X] ; X + mov eax, [vertex + x] ; X lea dlp, [gc + tsuDataList] ; Reset the dataList GR_FIFO_WRITE fifo, -8, eax ; PCI write X - mov eax, [vertex + Y] ; Y + mov eax, [vertex + y] ; Y xor packCol, packCol ; Clear packed color GR_FIFO_WRITE fifo, -4, eax ; PCI write Y @@ -1159,7 +1143,7 @@ jne .__paramLoop - align 4 + ALIGN 32 .__nextVertex: ;; On to the next vertex add vOffset, 4 diff --git a/glide2x/cvg/glide/src/xtexdl.asm b/glide2x/cvg/glide/src/xtexdl.asm index 1238292..3f2fbaf 100644 --- a/glide2x/cvg/glide/src/xtexdl.asm +++ b/glide2x/cvg/glide/src/xtexdl.asm @@ -19,6 +19,10 @@ ;; $Header$ ;; $Revision$ ;; $Log$ +;; Revision 1.1.2.1 2004/12/23 20:45:56 koolsmoky +;; converted to nasm syntax +;; added x86 asm, 3dnow! triangle and mmx, 3dnow! texture download optimizations +;; ;; Revision 1.1.1.1.8.1 2003/11/03 13:34:30 dborca ;; Voodoo2 happiness (DJGPP & Linux) ;; @@ -74,14 +78,14 @@ %include "xos.inc" -extrn _FifoMakeRoom +extrn _FifoMakeRoom, 12 %MACRO _grCommandTransportMakeRoom 3 push %3 push %2 push %1 call _FifoMakeRoom - add esp, 12 + ;add esp, 12 %ENDMACRO ; _grCommandTransportMakeRoom ;;; Definitions of cvg regs and glide root structures. @@ -107,19 +111,15 @@ _texData$ equ 24 + STACKOFFSET %define curS ecx ; texture s-coordinate %define fRoom edx ; room available in fifo (in bytes) -;-------------------------------------------------------------------------- - -%IFNDEF GL_SSE2 - ;-------------------------------------------------------------------------- ; ; GL_AMD3D, GL_MMX ; ;-------------------------------------------------------------------------- -segment TEXT +segment TEXT - ALIGN 32 + ALIGN 32 %IFDEF GL_AMD3D proc _grTexDownload_3DNow_MMX, 24 @@ -156,7 +156,7 @@ proc _grTexDownload_MMX, 24 sub curT, eax ; curT = maxT - minT mov fifo, [gc + fifoPtr] ; fifoPtr - mov curS, [esp + _maxS$] ; curS = maxS + mov curS, [esp + _maxS$] ; curS = maxS = scanline width in DWORDs add curT, 1 ; curT = maxT - minT + 1 %IFDEF GL_AMD3D @@ -166,23 +166,24 @@ proc _grTexDownload_MMX, 24 emms ; we'll use MMX %ENDIF - mov edx, curS ; curS = maxS = scanline width in DWORDs movd mm3, [esp + _baseAddr$] ; 0 | address of texture to download - shl curS, 2 ; scan line width (in bytes) + add curS, curS ; + add curS, curS ; scan line width (in bytes) mov eax, [esp + _minT$] ; 0 | minT mov [esp + _maxS$], curS ; save scan line width (in bytes) - shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs + mov edx, curS ; - shl eax, 9 ; TEX_ROW_ADDR_INCR(minT) = minT << 9 + add edx, edx ; packetHdr<21:3> = maxS = scanline width in DWORDs or edx, 0xc0000005 ; packetHdr<31:30> = texture port ; packetHdr<21:3> = maxS ; packetHdr<2:0> = packetType 5 movd mm1, edx ; 0 | packetHdr - movd mm2, eax ; 0 | TEX_ROW_ADDR_INCR(minT) + movd mm2, eax ; 0 | minT + psllq mm2, 9 ; 0 | TEX_ROW_ADDR_INCR(minT) = minT << 9 paddd mm3, mm2 ; 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT) movd mm2, [gc + tex_ptr] ; 0 | gc->tex_ptr @@ -230,7 +231,7 @@ proc _grTexDownload_MMX, 24 mov [gc + fifoPtr], fifo ; store new fifoPtr jmp .startDownload ; fifo aligned, download texture now - align 32 + ALIGN 32 ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS ;; edx=fifoRoom, mm1 = texAddr-gc->tex_ptr|packetHdr, mm2 = TEX_ROW_ADDR_INCR(1)|0 @@ -327,394 +328,3 @@ proc _grTexDownload_MMX, 24 ret ; pop 6 DWORD parameters and return endp - -%ELSE ; !GL_SSE2 - -;-------------------------------------------------------------------------- -; -; GL_SSE2 -; -;-------------------------------------------------------------------------- - -segment TEXT - - ALIGN 32 - -proc _grTexDownload_SSE2_64, 24 - - push ebx ; save caller's register variable - mov curT, [esp + _maxT$ - 12] ; curT = maxT - - push esi ; save caller's register variable - mov eax, [esp + _minT$ - 8] ; minT - - push edi ; save caller's register variable - mov gc, [esp + _gc$ - 4] ; gc - - push ebp ; save caller's register variable - mov dataPtr, [esp + _texData$]; dataPtr - -%IFDEF GLIDE_ALT_TAB - test gc, gc - je .dlDone -; mov edx, [gc + windowed] -; test edx, 1 -; jnz .pastContextTest - mov edx, DWORD [gc+lostContext] - mov ecx, [edx] - test ecx, 1 - jnz .dlDone -;.pastContextTest: -%ENDIF - - sub curT, eax ; curT = maxT - minT - mov fifo, [gc + fifoPtr] ; fifoPtr - - mov curS, [esp + _maxS$] ; curS = maxS - add curT, 1 ; curT = maxT - minT + 1 - - mov edx, curS ; curS = maxS = scanline width in DWORDs - movd xmm3,[esp + _baseAddr$] ; 0 | 0 | 0 | address of texture to download - - shl curS, 2 ; scan line width (in bytes) - mov eax, [esp + _minT$] ; 0 | 0 | 0 | minT - - mov [esp + _maxS$], curS ; save scan line width (in bytes) - shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs - - imul eax, curS ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1) - - movd xmm2,curS ; 0 | 0 | TEX_ROW_ADDR_INCR(1) - or edx, 00000005h ; packetHdr<31:30> = lfb port - ; packetHdr<21:3> = maxS - ; packetHdr<2:0> = packetType 5 - - movd xmm1,edx ; 0 | 0 | packetHdr - movd xmm4,eax ; 0 | 0 | TEX_ROW_ADDR_INCR(minT) - - psllq xmm2,32 ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0 - paddd xmm3,xmm4 ; 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT) - - mov fRoom, [gc + fifoRoom] ; get available fifoRoom (in bytes) - punpckldq xmm1,xmm3 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS - ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - - test fifo, 4 ; is fifo QWORD aligned ? - jz .startDownload ; yup, start texture download - - cmp fRoom, 4 ; enough room for NULL packet in fifo? - jge .xmmAlignFifo ; yes, write NULL packet to align fifo - -%ifdef USE_PACKET_FIFO - _grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room -%endif - - mov fifo, [gc + fifoPtr] ; fifoPtr modified by _grCommandTransportMakeRoom, reload - - mov fRoom, [gc + fifoRoom] ; fifoRoom modified by _grCommandTransportMakeRoom, reload - mov curS, [esp + _maxS$] ; reload maxS (destroyed by call to _grCommandTransportMakeRoom) - - test fifo, 4 ; new fifoPtr QWORD aligned ? - jz .startDownload ; yup, start texture download - -.xmmAlignFifo: - - mov DWORD [fifo], 0 ; write NULL packet - sub fRoom, 4 ; fifoRoom -= 4 - - mov [gc + fifoRoom], fRoom ; store new fifoRoom - add fifo, 4 ; fifoPtr += 4 - -%IFDEF GLIDE_DEBUG - mov [gc + checkPtr], fifo ; checkPtr -%ENDIF - - mov [gc + fifoPtr], fifo ; store new fifoPtr - jmp .startDownload ; fifo aligned, download texture now - - align 32 - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS - ;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - -.loopT: - -%IFDEF GLIDE_DEBUG - - ;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned - - test fifo, 4 ; is fifoPtr QWORD aligned ? - jz .alignmentOK ; yup, continue - - xor eax, eax ; create 0 - mov [eax], eax ; move to DS:[0] forces GP -.alignmentOK: -%ENDIF ; GLIDE_DEBUG - - ;; Compute packet header words - ;; hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0] - ;; hdr2: download address[29:0] - - movq [fifo],xmm1 ; store hdr2 | hdr1 - add fifo, 8 ; increment fifo ptr (hdr1 + hdr2) - - ;; S coordinate inner loop unrolled for 8 texels a write - -.loopS: - - movq xmm0,[dataPtr] ; load 64 bit data (8 texels) - add fifo, 8 ; pre-increment fifoPtr += 2 * sizeof(FxU32) - - add dataPtr, 8 ; dataPtr += 2 * sizeof(FxU32) - sub curS, 8 ; curS -= 2 * sizeof(FxU32) - - movq [fifo - 8],xmm0 ; *fifoPtr = texelData[64 bits] - jnz .loopS ; loop while curS > 0 - - mov ecx, [gc + fifoPtr] ; old fifo ptr - nop ; filler - - mov eax, fifo ; new fifo ptr - mov [gc + fifoPtr], fifo ; save new fifo ptr - -%IFDEF GLIDE_DEBUG - mov [gc + checkPtr], fifo ; checkPtr -%ENDIF - - sub eax, ecx ; new fifo ptr - old fifo ptr = fifo space used up - mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes) - - sub fRoom, eax ; new fifo space available = old fifo space available - fifo space used up = new fifo space available - sub curT, 1 ; curT-- - - mov [gc + fifoRoom], fRoom ; save new fifo space available - jz .dlDone ; loop while curT > 0 - - ;; Check for room to write the next texture scanline - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo - ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - - paddd xmm1,xmm2 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr - mov esp, esp ; filler -.startDownload: - lea eax, [curS+8] ; fifo space needed = scan line width + header size - - cmp fRoom, eax ; fifo space available >= fifo space required ? - jge .loopT ; yup, write next scan line - -%ifdef USE_PACKET_FIFO - _grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room (if fifoPtr QWORD aligned before -%endif - - mov fifo, [gc + fifoPtr] ; fifoPtr was modified by _grCommandTransportMakeRoom, reload - - mov fRoom, [gc + fifoRoom] ; fifoRoom was modified by _grCommandTransportMakeRoom, reload - mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes) - jmp .loopT ; we now have enough fifo room, write next scanline - -.dlDone: - pop ebp ; restore caller's register variable - pop edi ; restore caller's register variable - - pop esi ; restore caller's register variable - pop ebx ; restore caller's register variable - - ret ; pop 6 DWORD parameters and return -endp - - - -segment TEXT - - ALIGN 32 - -proc _grTexDownload_SSE2_128, 24 - - push ebx ; save caller's register variable - mov curT, [esp + _maxT$ - 12] ; curT = maxT - - push esi ; save caller's register variable - mov eax, [esp + _minT$ - 8] ; minT - - push edi ; save caller's register variable - mov gc, [esp + _gc$ - 4] ; gc - - push ebp ; save caller's register variable - mov dataPtr, [esp + _texData$]; dataPtr - -%IFDEF GLIDE_ALT_TAB - test gc, gc - je .dlDone -; mov edx, [gc + windowed] -; test edx, 1 -; jnz .pastContextTest - mov edx, DWORD [gc+lostContext] - mov ecx, [edx] - test ecx, 1 - jnz .dlDone -;.pastContextTest: -%ENDIF - - sub curT, eax ; curT = maxT - minT - mov fifo, [gc + fifoPtr] ; fifoPtr - - mov curS, [esp + _maxS$] ; curS = maxS - add curT, 1 ; curT = maxT - minT + 1 - - mov edx, curS ; curS = maxS = scanline width in DWORDs - movd xmm3,[esp + _baseAddr$] ; 0 | 0 | 0 | address of texture to download - - shl curS, 2 ; scan line width (in bytes) - mov eax, [esp + _minT$] ; 0 | minT - - mov [esp + _maxS$], curS ; save scan line width (in bytes) - shl edx, 3 ; packetHdr<21:3> = maxS = scanline width in DWORDs - - imul eax, curS ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1) - - movd xmm2,curS ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(1) - or edx, 00000005h ; packetHdr<31:30> = lfb port - ; packetHdr<21:3> = maxS - ; packetHdr<2:0> = packetType 5 - - movd xmm1,edx ; 0 | 0 | 0 | packetHdr - movd xmm4,eax ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(minT) - - psllq xmm2,32 ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0 - paddd xmm3,xmm4 ; 0 | 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT) - - mov fRoom, [gc + fifoRoom] ; get available fifoRoom (in bytes) - punpckldq xmm1,xmm3 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS - ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - - test fifo, 4 ; is fifo QWORD aligned ? - jz .startDownload ; yup, start texture download - - cmp fRoom, 4 ; enough room for NULL packet in fifo? - jge .xmmAlignFifo ; yes, write NULL packet to align fifo - -%ifdef USE_PACKET_FIFO - _grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room -%endif - - mov fifo, [gc + fifoPtr] ; fifoPtr modified by _grCommandTransportMakeRoom, reload - - mov fRoom, [gc + fifoRoom] ; fifoRoom modified by _grCommandTransportMakeRoom, reload - mov curS, [esp + _maxS$] ; reload maxS (destroyed by call to _grCommandTransportMakeRoom) - - test fifo, 4 ; new fifoPtr QWORD aligned ? - jz .startDownload ; yup, start texture download - -.xmmAlignFifo: - - mov DWORD [fifo], 0 ; write NULL packet - sub fRoom, 4 ; fifoRoom -= 4 - - mov [gc + fifoRoom], fRoom ; store new fifoRoom - add fifo, 4 ; fifoPtr += 4 - -%IFDEF GLIDE_DEBUG - mov [gc + checkPtr], fifo ; checkPtr -%ENDIF - - mov [gc + fifoPtr], fifo ; store new fifoPtr - jmp .startDownload ; fifo aligned, download texture now - - align 32 - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS - ;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - -.loopT: - -%IFDEF GLIDE_DEBUG - - ;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned - - test fifo, 4 ; is fifoPtr QWORD aligned ? - jz .alignmentOK ; yup, continue - - xor eax, eax ; create 0 - mov [eax], eax ; move to DS:[0] forces GP -.alignmentOK: -%ENDIF ; GLIDE_DEBUG - - ;; Compute packet header words - ;; hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0] - ;; hdr2: download address[29:0] - - movq [fifo],xmm1 ; store hdr2 | hdr1 - add fifo, 8 ; increment fifo ptr (hdr1 + hdr2) - - ;; S coordinate inner loop unrolled for 8 texels a write - -.loopS: - - movdqu xmm0, [dataPtr] ; load 128 bit data (8 texels) ; isn't 16 bytes aligned? - add fifo, 16 ; pre-increment fifoPtr += 4 * sizeof(FxU32) - - add dataPtr, 16 ; dataPtr += 4 * sizeof(FxU32) - sub curS, 16 ; curS -= 4 * sizeof(FxU32) - - movdqu [fifo - 16], xmm0 ; *fifoPtr = texelData[128 bits] ; isn't 16 bytes aligned? - jnz .loopS ; loop while curS > 0 - - mov ecx, [gc + fifoPtr] ; old fifo ptr - nop ; filler - - mov eax, fifo ; new fifo ptr - mov [gc + fifoPtr], fifo ; save new fifo ptr - -%IFDEF GLIDE_DEBUG - mov [gc + checkPtr], fifo ; checkPtr -%ENDIF - - sub eax, ecx ; new fifo ptr - old fifo ptr = fifo space used up - mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes) - - sub fRoom, eax ; new fifo space available = old fifo space available - fifo space used up = new fifo space available - sub curT, 1 ; curT-- - - mov [gc + fifoRoom], fRoom ; save new fifo space available - jz .dlDone ; loop while curT > 0 - - ;; Check for room to write the next texture scanline - - ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo - ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0 - - paddd xmm1,xmm2 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr - mov esp, esp ; filler -.startDownload: - lea eax, [curS+8] ; fifo space needed = scan line width + header size - - cmp fRoom, eax ; fifo space available >= fifo space required ? - jge .loopT ; yup, write next scan line - -%ifdef USE_PACKET_FIFO - _grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room (if fifoPtr QWORD aligned before -%endif - - mov fifo, [gc + fifoPtr] ; fifoPtr was modified by _grCommandTransportMakeRoom, reload - - mov fRoom, [gc + fifoRoom] ; fifoRoom was modified by _grCommandTransportMakeRoom, reload - mov curS, [esp + _maxS$] ; curS = maxS = width of scanline (bytes) - jmp .loopT ; we now have enough fifo room, write next scanline - -.dlDone: - pop ebp ; restore caller's register variable - pop edi ; restore caller's register variable - - pop esi ; restore caller's register variable - pop ebx ; restore caller's register variable - - ret ; pop 6 DWORD parameters and return -endp - - -%ENDIF ; GL_SSE2