diff --git a/glide3x/cvg/glide3/src/cpudtect.asm b/glide3x/cvg/glide3/src/cpudtect.asm
index 6eb0ce0..17051f8 100644
--- a/glide3x/cvg/glide3/src/cpudtect.asm
+++ b/glide3x/cvg/glide3/src/cpudtect.asm
@@ -19,6 +19,9 @@
 ;;
 ;; $Header$
 ;; $Log$
+;; Revision 1.1.1.1  1999/12/07 21:42:30  joseph
+;; Initial checkin into SourceForge.
+;;
 ;; 
 ;; 1     10/08/98 11:30a Brent
 ;; 
@@ -35,10 +38,7 @@
 ;;
 ;;
 
-TITLE  cpudtect.asm
-
-.586P
-.model FLAT,C                   ; Flat memory, mangle publics with leading '_'
+%include "xos.inc"
 
 ;;      Data for data segment goes here
 ;_DATA   SEGMENT DWORD USE32 PUBLIC 'DATA'; 
@@ -46,14 +46,14 @@ TITLE  cpudtect.asm
     
 ;;; Some useful constants
 ; CPU Type
-CPUTypeUnknown  = 0ffffffffh
-CPUTypePrePent  = 4h
-CPUTypeP5       = 5h    
-CPUTypeP6       = 6h    
+CPUTypeUnknown  equ 0ffffffffh
+CPUTypePrePent  equ 4h
+CPUTypeP5       equ 5h
+CPUTypeP6       equ 6h
         
 ;;; References to external data:
     
-_TEXT   SEGMENT
+segment		TEXT
 ;;
 ;;  _cpu_detect_asm - detect the type of CPU 
 ;; 
@@ -63,10 +63,8 @@ _TEXT   SEGMENT
 ;;
 ;;  returns 4 for non-pen
 
-PUBLIC  _cpu_detect_asm
-_cpu_detect_asm PROC NEAR
+proc _cpu_detect_asm
 P6Stuff:
-    .586
     pushad                              ; save all regs.
 
     ; First, determine whether CPUID instruction is available.
@@ -177,45 +175,38 @@ UnknownVendor:
     mov eax, 0ffffffffh
     ret
         
-_cpu_detect_asm ENDP
+endp
 
 
 ;------------------------------------------------------------------------------   
 ; this routine sets the precision to single
 ; which effects all adds, mults, and divs
     align 4                 ; 
-    PUBLIC  single_precision_asm
-single_precision_asm PROC NEAR
-.586
+proc single_precision_asm              ; rewrites the x87 control word with bits 9:8 cleared
     push  eax       ; make room
     fnclex          ; clear pending exceptions    
-    fstcw WORD PTR [esp]
-    mov   eax, DWORD PTR [esp]
+    fstcw WORD [esp]                    ; store the control word in the slot pushed above
+    mov   eax, DWORD [esp]
     and   eax, 0000fcffh  ; clear bits 9:8
-    mov   DWORD PTR [esp], eax
-    fldcw WORD PTR [esp]
+    mov   DWORD [esp], eax
+    fldcw WORD [esp]                    ; load the modified control word back
     pop   eax
-    ret   0
-single_precision_asm ENDP
+    ret
+endp
 
 ;------------------------------------------------------------------------------   
 ; this routine sets the precision to double
 ; which effects all adds, mults, and divs
     align 4                 ; 
-    PUBLIC  double_precision_asm
-double_precision_asm PROC NEAR
-.586
+proc double_precision_asm              ; NOTE(review): the or mask 02ffh below sets bits 7:0 as well as bit 9
     push  eax       ; make room
     fnclex          ; clear pending exceptions    
-    fstcw WORD PTR [esp]
-    mov   eax, DWORD PTR [esp]
+    fstcw WORD [esp]                    ; store the control word in the slot pushed above
+    mov   eax, DWORD [esp]
     and   eax, 0000fcffh  ; clear bits 9:8
     or    eax, 000002ffh  ; set 9:8 to 10
-    mov   DWORD PTR [esp], eax
-    fldcw WORD PTR [esp]
+    mov   DWORD [esp], eax
+    fldcw WORD [esp]                    ; load the modified control word back
     pop   eax
-    ret   0
-double_precision_asm ENDP
-    
-_TEXT ENDS
-END
\ No newline at end of file
+    ret
+endp
diff --git a/glide3x/cvg/glide3/src/diget.c b/glide3x/cvg/glide3/src/diget.c
index fb406ef..91574cb 100644
--- a/glide3x/cvg/glide3/src/diget.c
+++ b/glide3x/cvg/glide3/src/diget.c
@@ -19,6 +19,9 @@
 **
 ** $Header$
 ** $Log$
+** Revision 1.1.1.1  1999/12/07 21:42:30  joseph
+** Initial checkin into SourceForge.
+**
 ** 
 ** 1     10/08/98 11:30a Brent
 ** 
@@ -476,10 +479,10 @@ GR_DIENTRY(grGet, FxU32, (FxU32 pname, FxU32 plength, FxI32 *params))
       switch(hwc->SSTs[_GlideRoot.current_sst].type) {
       case GR_SSTTYPE_VOODOO:
       case GR_SSTTYPE_Voodoo2:
-        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.VoodooConfig.fbRam;
+        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.VoodooConfig.fbRam << 20;
         break;
       case GR_SSTTYPE_SST96:
-        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.SST96Config.fbRam;
+        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.SST96Config.fbRam << 20;
         break;
       default:
         *params = 0;    /* XXX UMA architecture */
@@ -493,10 +496,10 @@ GR_DIENTRY(grGet, FxU32, (FxU32 pname, FxU32 plength, FxI32 *params))
       switch(hwc->SSTs[_GlideRoot.current_sst].type) {
       case GR_SSTTYPE_VOODOO:
       case GR_SSTTYPE_Voodoo2:
-        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.VoodooConfig.tmuConfig[0].tmuRam;
+        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.VoodooConfig.tmuConfig[0].tmuRam << 20;
         break;
       case GR_SSTTYPE_SST96:
-        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.SST96Config.tmuConfig.tmuRam;
+        *params = hwc->SSTs[_GlideRoot.current_sst].sstBoard.SST96Config.tmuConfig.tmuRam << 20;
         break;
       default:
         *params = 0;    /* XXX UMA architecture */
@@ -772,6 +775,32 @@ GR_DIENTRY(grGetString, const char *, (FxU32 pname))
 #undef FN_NAME
 } /* grGetString */
 
+/*-------------------------------------------------------------------
+  Function: grGetRegistryOrEnvironmentString
+  Date: 4/17/2000       
+  Implementor(s): atom
+  Description: 
+
+    This is here so the spooky code for finding the correct registry
+    tweak path in 9x/NT/2K does not have to be duplicated in 3dfxogl.
+
+  Arguments: char* to the name of the setting to check for.
+  
+  Return: the result of getenv(theEntry); this implementation reads
+          only the environment and performs no registry lookup.
+  -------------------------------------------------------------------*/
+GR_DIENTRY(grGetRegistryOrEnvironmentString, char*, (char* theEntry))
+{
+#define FN_NAME "grGetRegistryOrEnvironmentString"
+  char*  retval ;
+
+  retval = getenv(theEntry) ;
+
+  return retval ;
+
+#undef FN_NAME
+} /* grGetRegistryOrEnvironmentString */
+
 /*-------------------------------------------------------------------
   Function: grReset
   Date: 16-Dec-97
@@ -881,6 +910,10 @@ GR_DIENTRY(grGetProcAddress, GrProc, (char *procName))
       return (GrProc)_GlideRoot.deviceArchProcs.curLineProc;
     if (!strcmp(procName, "guQueryResolutionXYExt"))
       return (GrProc)guQueryResolutionXY;
+    if (!strcmp(procName, "grGetRegistryOrEnvironmentStringExt"))
+      return (GrProc)grGetRegistryOrEnvironmentString;
+    if (!strcmp(procName, "grTexDownloadTableExt"))
+      return (GrProc)grTexDownloadTableExt;
   }
   return NULL;
 
diff --git a/glide3x/cvg/glide3/src/distate.c b/glide3x/cvg/glide3/src/distate.c
index c625b8a..db39ff7 100644
--- a/glide3x/cvg/glide3/src/distate.c
+++ b/glide3x/cvg/glide3/src/distate.c
@@ -19,6 +19,9 @@
  **
  ** $Header$
  ** $Log$
+ ** Revision 1.1.1.1.8.1  2003/06/29 18:45:55  guillemj
+ ** Fixed preprocessor invalid token errors.
+ **
  ** Revision 1.1.1.1  1999/12/07 21:42:31  joseph
  ** Initial checkin into SourceForge.
  **
@@ -507,6 +510,57 @@ GR_DIENTRY(grDepthBufferMode, void , (GrDepthBufferMode_t mode) )
  #undef FN_NAME
 } /* grDepthBufferMode */
 
+/*-------------------------------------------------------------------
+  Function: grStipplePattern
+  Date: 23-Nov-2000
+  Implementor(s): alanh
+  Description: Placeholder entry point. The state update is still
+    commented out below (TODO), so a call currently does nothing.
+  Arguments: stipple - pattern value; unused by the current body.
+  
+  Return: none
+  -------------------------------------------------------------------*/
+GR_DIENTRY(grStipplePattern, void , (GrStipplePattern_t stipple))
+{
+ #define FN_NAME "grStipplePattern"
+
+  /* [dBorca] TODO
+   *
+  GR_BEGIN_NOFIFOCHECK("grStipplePattern\n", 85);
+
+  INVALIDATE(stipple);
+
+  STOREARG(grStipplePattern, stipple);
+   */
+
+ #undef FN_NAME
+} /* grStipplePattern */
+
+/*-------------------------------------------------------------------
+  Function: grStippleMode
+  Date: 23-Nov-2000
+  Implementor(s): alanh
+  Description: Placeholder entry point. The state update is still
+    commented out below (TODO), so a call currently does nothing.
+  Arguments: mode - stipple mode value; unused by the current body.
+  
+  Return: none
+  -------------------------------------------------------------------*/
+GR_DIENTRY(grStippleMode, void , (GrStippleMode_t mode) )
+{
+ #define FN_NAME "grStippleMode"
+
+  /* [dBorca] TODO
+   *
+  GR_BEGIN_NOFIFOCHECK("grStippleMode\n", 85);
+
+  INVALIDATE(fbzMode);
+
+  STOREARG(grStippleMode, mode);
+   */
+
+ #undef FN_NAME
+} /* grStippleMode */
 
 /*-------------------------------------------------------------------
   Function: grDitherMode
diff --git a/glide3x/cvg/glide3/src/fxgasm.c b/glide3x/cvg/glide3/src/fxgasm.c
index eb78c91..09139de 100644
--- a/glide3x/cvg/glide3/src/fxgasm.c
+++ b/glide3x/cvg/glide3/src/fxgasm.c
@@ -37,7 +37,29 @@
  * macros for creating assembler offset files
  *----------------------------------------------------------------------*/
 
-#ifndef __linux__
+#if 1	/* defined(NASM) - default */
+#define NEWLINE printf("\n")
+#define COMMENT printf(";----------------------------------------------------------------------\n")
+/* HEADER(str): blank line, rule, "; Assembler offsets for <str> struct", rule, blank line. */
+#define HEADER(str)     NEWLINE; COMMENT; \
+                        printf("; Assembler offsets for %s struct\n",str);\
+                        COMMENT; NEWLINE
+/* OFFSET: prints "pname equ <offset of p.o within p>"; char* subtraction replaces pointer-to-int casts. */
+#define OFFSET(p,o,pname) if (hex) \
+        printf("%s\tequ %08xh\n",pname,(unsigned)((char*)&p.o-(char*)&p)); \
+    else printf("%s\tequ %10d\n",pname,(int)((char*)&p.o-(char*)&p))
+/* OFFSET2: as OFFSET, but o is a complete lvalue expression instead of a member name of p. */
+#define OFFSET2(p,o,pname) if (hex) \
+        printf("%s\tequ %08xh\n",pname,(unsigned)((char*)&o-(char*)&p)); \
+    else printf("%s\tequ %10d\n",pname,(int)((char*)&o-(char*)&p))
+/* SIZEOF: prints "SIZEOF_<pname> equ sizeof(p)"; the casts make the size_t argument match %lx / %ld. */
+#define SIZEOF(p,pname) if (hex) \
+        printf("SIZEOF_%s\tequ %08lxh\n",pname,(unsigned long)sizeof(p)); \
+    else printf("SIZEOF_%s\tequ %10ld\n",pname,(long)sizeof(p))
+
+#else	/* !NASM */
+
+#if !defined(__linux__) && !defined(__DJGPP__)
 #define NEWLINE printf("\n")
 #define COMMENT printf(";----------------------------------------------------------------------\n")
 
@@ -57,7 +79,7 @@
         printf("SIZEOF_%s\t= %08xh\n",pname,sizeof(p)); \
     else printf("SIZEOF_%s\t= %10d\n",pname,sizeof(p))
 
-#else
+#else   /* defined(__linux__) || defined(__DJGPP__) */
 
 #define NEWLINE printf("\n");
 #define COMMENT printf("/*----------------------------------------------------------------------*/\n")
@@ -67,17 +89,19 @@
                         COMMENT; NEWLINE
 
 #define OFFSET(p,o,pname) if (hex) \
-        printf("#define %s %08x\n",pname,((int)&p.o)-(int)&p); \
+        printf("#define %s 0x%08x\n",pname,((int)&p.o)-(int)&p); \
     else printf("#define %s %10d\n",pname,((int)&p.o)-(int)&p)
 
 #define OFFSET2(p,o,pname) if (hex) \
-        printf("#define %s %08x\n",pname,((int)&o)-(int)&p); \
+        printf("#define %s 0x%08x\n",pname,((int)&o)-(int)&p); \
     else printf("#define %s %10d\n",pname,((int)&o)-(int)&p)
 
 #define SIZEOF(p,pname) if (hex) \
-        printf("#define SIZEOF_%s %08x\n",pname,sizeof(p)); \
+        printf("#define SIZEOF_%s 0x%08x\n",pname,sizeof(p)); \
     else printf("#define SIZEOF_%s %10d\n",pname,sizeof(p))
-#endif
+#endif  /* defined(__linux__) || defined(__DJGPP__) */
+
+#endif  /* defined(NASM)*/
 
 int
 main (int argc, char **argv)
@@ -87,7 +111,7 @@ main (int argc, char **argv)
     static GrGC gc;
 
 #if !GLIDE_HW_TRI_SETUP
-    static Sstregs sst;
+    static SstRegs sst;
     static struct dataList_s dl;
 #endif /* !GLIDE_HW_TRI_SETUP */
 
@@ -99,18 +123,18 @@ main (int argc, char **argv)
         printf("#define __FX_INLINE_H__\n");
         printf("\n");
 
-        printf("#define kCurGCOffset   0x%XUL\n",
+        printf("#define kCurGCOffset   0x%lXUL\n",
                offsetof(struct _GlideRoot_s, curGC));
 
 #if GLIDE_DISPATCH_SETUP
-        printf("#define kTriProcOffset 0x%XUL\n",
+        printf("#define kTriProcOffset 0x%lXUL\n",
                offsetof(struct GrGC_s, curArchProcs.triSetupProc));
-	printf("#define kGCStateInvalid 0x%XUL\n",
+	printf("#define kGCStateInvalid 0x%lXUL\n",
 	       offsetof(struct GrGC_s, state.invalid));
 #endif /* GLIDE_DISPATCH_SETUP */
         
         printf("/* The # of 2-byte entries in the hw fog table */\n");
-        printf("#define kInternalFogTableEntryCount 0x%XUL\n",
+        printf("#define kInternalFogTableEntryCount 0x%lXUL\n",
                sizeof(dummyRegs.fogTable) >> 1);
 
         printf("\n");
diff --git a/glide3x/cvg/glide3/src/fxglide.h b/glide3x/cvg/glide3/src/fxglide.h
index 8f57366..85c3d12 100644
--- a/glide3x/cvg/glide3/src/fxglide.h
+++ b/glide3x/cvg/glide3/src/fxglide.h
@@ -19,6 +19,9 @@
 **
 ** $Header$
 ** $Log$
+** Revision 1.1.1.1.8.2  2003/07/25 07:13:41  dborca
+** removed debris
+**
 ** Revision 1.1.1.1.8.1  2003/06/29 18:45:55  guillemj
 ** Fixed preprocessor invalid token errors.
 **
@@ -2937,7 +2940,7 @@ do { \
 #define REG_GROUP_SETF_CLAMP(__regBase, __regAddr, __val) \
 do { \
   const FxU32 fpClampVal = FP_FLOAT_CLAMP(__val); \
-  REG_GROUP_ASSERT(__regAddr, fpClampVal, FXTRUE); \  
+  REG_GROUP_ASSERT(__regAddr, fpClampVal, FXTRUE); \
   SET(((FxU32*)(__regBase))[offsetof(SstRegs, __regAddr) >> 2], fpClampVal); \
   GR_INC_SIZE(sizeof(FxU32)); \
 } while(0)
@@ -3360,5 +3363,9 @@ extern void
 _grSliOriginClear(void); 
 #endif /* (GLIDE_PLATFORM & GLIDE_HW_CVG) */
 
+GR_ENTRY(grTexDownloadTableExt,
+         void,
+         (GrChipID_t tmu, GrTexTable_t type,  void *data));
+
 #endif /* __FXGLIDE_H__ */
 
diff --git a/glide3x/cvg/glide3/src/g3df.c b/glide3x/cvg/glide3/src/g3df.c
index 20446ad..8c3ce3d 100644
--- a/glide3x/cvg/glide3/src/g3df.c
+++ b/glide3x/cvg/glide3/src/g3df.c
@@ -19,6 +19,9 @@
 **
 ** $Header$
 ** $Log$
+** Revision 1.1.1.1  1999/12/07 21:42:32  joseph
+** Initial checkin into SourceForge.
+**
 ** 
 ** 1     10/08/98 11:30a Brent
 ** 
@@ -390,18 +393,13 @@ GR_DIENTRY(gu3dfGetInfo, FxBool,
   /*
   ** determine the color format of the input image
   */
-#ifdef __GNUC__
-  /* This function is not found in libgcc.a */
   {
-    char* tempStr = (char*)color_format;
-    while(*tempStr != '\0') *tempStr++ = toupper(*tempStr);
+    char *tempStr = (char*)color_format;  /* upper-case color_format in place */
+    while (*tempStr != '\0') {
+          *tempStr = (char)toupper((unsigned char)*tempStr);  /* negative char is undefined for toupper */
+          tempStr++;
+    }
   }
-#else
-        {
-                extern char* strupr(char*);
-        strupr(color_format);
-  }
-#endif /* __GNUC__ */
 
   i = 0;
   format_found = FXFALSE;
diff --git a/glide3x/cvg/glide3/src/glide.h b/glide3x/cvg/glide3/src/glide.h
index 4322353..da0f88b 100644
--- a/glide3x/cvg/glide3/src/glide.h
+++ b/glide3x/cvg/glide3/src/glide.h
@@ -52,6 +52,7 @@ extern "C" {
 typedef FxU32 GrColor_t;
 typedef FxU8  GrAlpha_t;
 typedef FxU32 GrMipMapId_t;
+typedef FxU32 GrStipplePattern_t;
 typedef FxU8  GrFog_t;
 typedef FxU32 GrContext_t;
 typedef int (FX_CALL *GrProc)();
@@ -240,6 +241,11 @@ typedef FxI32 GrDitherMode_t;
 #define GR_DITHER_2x2           0x1
 #define GR_DITHER_4x4           0x2
 
+typedef FxI32 GrStippleMode_t;
+#define GR_STIPPLE_DISABLE	0x0
+#define GR_STIPPLE_PATTERN	0x1
+#define GR_STIPPLE_ROTATE	0x2
+
 typedef FxI32 GrFogMode_t;
 #define GR_FOG_DISABLE                     0x0
 #define GR_FOG_WITH_TABLE_ON_FOGCOORD_EXT  0x1
diff --git a/glide3x/cvg/glide3/src/gsst.c b/glide3x/cvg/glide3/src/gsst.c
index f63755e..d40f3d5 100644
--- a/glide3x/cvg/glide3/src/gsst.c
+++ b/glide3x/cvg/glide3/src/gsst.c
@@ -19,6 +19,9 @@
 **
 ** $Header$
 ** $Log$
+** Revision 1.1.1.1  1999/12/07 21:42:33  joseph
+** Initial checkin into SourceForge.
+**
 ** 
 ** 1     10/08/98 11:30a Brent
 ** 
@@ -1684,6 +1687,28 @@ GR_ENTRY(grSstWinClose, FxBool, (GrContext_t context))
 #undef FN_NAME
 } /* grSstWinClose */
 
+/*-------------------------------------------------------------------
+  Function: grSetNumPendingBuffers
+  Date: 13-Oct-2000
+  Implementor(s): mmcclure
+  Description:
+  
+  Allow the application to supply the number of pending buffers
+  (currently a no-op: the assignment below is still commented out)
+  Arguments:
+
+  NumPendingBuffers - requested number of pending buffers (ignored for now)
+  
+  Return: none
+  -------------------------------------------------------------------*/
+GR_DIENTRY(grSetNumPendingBuffers, void, (FxI32 NumPendingBuffers))
+{
+  /* [dBorca] TODO
+   *
+  _GlideRoot.environment.swapPendingCount = NumPendingBuffers;
+   */
+}
+
 /*-------------------------------------------------------------------
   Function: grSelectContext
   Date: 18-Jan-98
diff --git a/glide3x/cvg/glide3/src/gtexdl.c b/glide3x/cvg/glide3/src/gtexdl.c
index 22b89f6..05434f4 100644
--- a/glide3x/cvg/glide3/src/gtexdl.c
+++ b/glide3x/cvg/glide3/src/gtexdl.c
@@ -19,6 +19,9 @@
 **
 ** $Header$
 ** $Log$
+** Revision 1.1.1.1  1999/12/07 21:42:34  joseph
+** Initial checkin into SourceForge.
+**
 ** 
 ** 1     10/08/98 11:30a Brent
 ** 
@@ -171,9 +174,297 @@ extern const int _grMipMapHostWH[G3_ASPECT_TRANSLATE(GR_ASPECT_1x8)+1][G3_LOD_TR
 extern const int _grMipMapHostWH[G3_ASPECT_TRANSLATE(GR_ASPECT_LOG2_1x8)+1][G3_LOD_TRANSLATE(GR_LOD_LOG2_1)+1][2];
 #endif
 
-#ifndef  GLIDE3_ALPHA
-#define GLIDE_POINTCAST_PALETTE
+#define GLIDE_POINTCAST_PALETTE 1
+
+/*---------------------------------------------------------------------------
+** _grTexDownloadNccTableExt
+** which == 0 writes nccTable0, otherwise nccTable1; start/end are asserted to be 0 and 11.
+** Downloads an ncctable to the specified _physical_ TMU(s).  This
+** function is called internally by Glide and should not be executed
+** by an application.
+*/
+GR_DDFUNC(_grTexDownloadNccTableExt,
+          void,
+          (GrChipID_t tmu, FxU32 which, const GuNccTable *table, int start, int end))
+{
+#define FN_NAME "_grTexDownloadNccTableExt"
+  GR_BEGIN_NOFIFOCHECK(FN_NAME,89);
+  GDBG_INFO_MORE(gc->myLevel,"(%d,%d, 0x%x, %d,%d)\n",tmu,which,table,start,end);
+  GR_ASSERT(start==0);
+  GR_ASSERT(end==11);
+
+  /* nothing to download for a null table; note this returns before GR_END() */
+  if (table == NULL) return;
+
+  _GlideRoot.stats.palDownloads++;
+  _GlideRoot.stats.palBytes += (end-start+1)<<2;
+
+  if (gc->tmu_state[tmu].ncc_table[which] != table) { /* skip when this table pointer is already recorded */
+    SstRegs* texHW;
+    int i;
+#ifdef GLIDE_POINTCAST_PALETTE
+    texHW = SST_TMU(hw,tmu);
+#else
+    texHW = SST_CHIP(hw,0xE);
 #endif
+
+    if (which == 0) {
+#ifdef GLIDE_POINTCAST_PALETTE
+      REG_GROUP_BEGIN((0x02UL << tmu), nccTable0, 12, 0x0FFF);
+#else
+      REG_GROUP_BEGIN(0x0EUL, nccTable0, 12, 0x0FFF);
+#endif
+      for (i = 0; i < 12; i++) REG_GROUP_SET(texHW, nccTable0[i], table->packed_data[i]);
+      REG_GROUP_END();
+    } else {
+#ifdef GLIDE_POINTCAST_PALETTE
+      REG_GROUP_BEGIN((0x02UL << tmu), nccTable1, 12, 0x0FFF);
+#else
+      REG_GROUP_BEGIN(0x0EUL, nccTable1, 12, 0x0FFF);
+#endif
+      for (i = 0; i < 12; i++) REG_GROUP_SET(texHW, nccTable1[i], table->packed_data[i]);
+      REG_GROUP_END();
+    }
+
+    gc->tmu_state[tmu].ncc_table[which] = table; /* remember the pointer for the check above */
+  }
+
+  GR_END();
+#undef FN_NAME
+} /* _grTexDownloadNccTableExt */
+
+/*-------------------------------------------------------------------
+  Function: _grTexDownloadPaletteExt
+  Date: 6/9
+  Implementor(s): jdt
+  Library: Glide
+  Description:
+    Private function to download a palette to the specified tmu
+  Arguments:
+    tmu - which tmu to download the palette to
+    pal - the palette data; type GR_TEXTABLE_PALETTE sends the low 24 bits, any other type packs 6 bits per byte
+    start - beginning index to download
+    end   - ending index to download
+  Return:
+    none
+  -------------------------------------------------------------------*/
+GR_DDFUNC(_grTexDownloadPaletteExt,
+          void,
+          (GrChipID_t tmu, GrTexTable_t type, GuTexPalette *pal, int start, int end))
+{
+#define FN_NAME "_grTexDownloadPaletteExt"
+  GR_BEGIN_NOFIFOCHECK(FN_NAME, 89);
+  GDBG_INFO_MORE(gc->myLevel,"(%d,0x%x, %d,%d)\n",tmu,pal,start,end);
+
+  GR_CHECK_F(FN_NAME, pal == NULL, "pal invalid");
+  GR_CHECK_F(FN_NAME, start < 0, "invalid start index");
+  GR_CHECK_F(FN_NAME, end > 255, "invalid end index");
+    
+  /* NOTE:
+  **
+  **  This code broadcasts the palette because in the future, we will
+  **  only support one global texture palette no matter how many TMUs
+  **  there are.  This is fallout from the fact that future hardware
+  **  has a unified memory architecture.
+  **  
+  **  Source licensees (meaning arcade or LBE vendors) that require the
+  **  one palette/tmu mode should define GLIDE_POINTCAST_PALETTE on
+  **  the command line for this file.  Understand, however, that this
+  **  will not work on future hardware.
+  */
+
+#ifdef GLIDE_POINTCAST_PALETTE
+  /*
+  **  FURTHER NOTE:  
+  **  There is a subtlety (nice way of saying BUG) here.
+  **  If TMU0 is specified, then the palette will be broadcast to all
+  **  TMUS.  So, if the user downloads TMU1's palette, then TMU0's
+  **  palette, TMU0's palette will be on *both* TMUs.  This is a
+  **  pretty strong indicator that no one is using separate palettes
+  **  on different TMUs.
+  */
+  hw = SST_TMU(hw,tmu);
+#else
+  hw = SST_CHIP(hw,0xE);
+#endif
+
+  _GlideRoot.stats.palDownloads++;
+  _GlideRoot.stats.palBytes += ((end - start + 1) << 2);
+
+  /* We divide the writes into 3 chunks trying to group things into
+   * complete 8 word grouped packets to fit the nccTable palette
+   * format: stuff before the 8 word alignment, aligned writes, and
+   * stuff after the 8 word alignment to the end. The slop regions
+   * are one packet apiece.  
+   */
+  {
+#ifdef GLIDE_POINTCAST_PALETTE
+    const FifoChipField chipId = (FifoChipField)(0x02UL << tmu);
+#else
+    const FifoChipField chipId = (FifoChipField)0x0EUL;
+#endif
+    const int endSlop = (end & ~0x07);
+    const int startSlop = MIN(((start + 8) & ~0x07) - 1, end);
+    int i = start;
+
+    /* Is the start of the palette range unaligned or is the end of
+     * the range less than a completely aligned range?  
+     */
+    if (type == GR_TEXTABLE_PALETTE) {
+      if (((start & 0x07) != 0) || (end < ((start + 8) & ~0x07))) {
+        const FxI32 slopCount = startSlop - start + 1;
+        GR_ASSERT((slopCount > 0) && (slopCount <= 8));
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4 + (start & 0x07)], 
+                        slopCount, (0xFF >> (8 - slopCount)));
+        while(i < start + slopCount) {
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | pal->data[i] & 0xFFFFFF));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+      
+      /* Do all of the aligned palette ranges. */
+      while(i < endSlop) {
+        const int endIndex = i + 8;
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4], 8, 0xFF);
+        while(i < endIndex) {
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | pal->data[i] & 0xFFFFFF));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+      
+      /* Do we have any more slop at the end of the range? */
+      if (i <= end) {
+        const FxU32 slopCount = end - endSlop + 1;
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4], 
+                        slopCount, (0xFF >> (8 - slopCount)));
+        while(i <= end) {
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | pal->data[i] & 0xFFFFFF));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+    }
+    else { /* any other type: keep the top 6 bits of each byte, packed into a 24-bit value */
+      if (((start & 0x07) != 0) || (end < ((start + 8) & ~0x07))) {
+        const FxI32 slopCount = startSlop - start + 1;
+        GR_ASSERT((slopCount > 0) && (slopCount <= 8));
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4 + (start & 0x07)], 
+                        slopCount, (0xFF >> (8 - slopCount)));
+        while(i < start + slopCount) {
+          FxU32 p1, p2, p3, p4;
+          p1 = p2 = pal->data[i];
+          p1 &= 0xfc000000;          p2 &= 0x00fc0000;
+          p1 >>= 8;                  p2 >>= 6;
+          p3 = p4 = pal->data[i];
+          p3 &= 0x0000fc00;          p4 &= 0x000000fc;
+          p3 >>= 4;                  p4 >>= 2;
+          p1 |= p2;                  p3 |= p4;              p1 |= p3;
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | p1));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+      
+      /* Do all of the aligned palette ranges. */
+      while(i < endSlop) {
+        const int endIndex = i + 8;
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4], 8, 0xFF);
+        while(i < endIndex) {
+          FxU32 p1, p2, p3, p4;
+          p1 = p2 = pal->data[i];
+          p1 &= 0xfc000000;          p2 &= 0x00fc0000;
+          p1 >>= 8;                  p2 >>= 6;
+          p3 = p4 = pal->data[i];
+          p3 &= 0x0000fc00;          p4 &= 0x000000fc;
+          p3 >>= 4;                  p4 >>= 2;
+          p1 |= p2;                  p3 |= p4;              p1 |= p3;
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | p1));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+      
+      /* Do we have any more slop at the end of the range? */
+      if (i <= end) {
+        const FxU32 slopCount = end - endSlop + 1;
+        
+        REG_GROUP_BEGIN(chipId, nccTable0[4], 
+                        slopCount, (0xFF >> (8 - slopCount)));
+        while(i <= end) {
+          FxU32 p1, p2, p3, p4;
+          p1 = p2 = pal->data[i];
+          p1 &= 0xfc000000;          p2 &= 0x00fc0000;
+          p1 >>= 8;                  p2 >>= 6;
+          p3 = p4 = pal->data[i];
+          p3 &= 0x0000fc00;          p4 &= 0x000000fc;
+          p3 >>= 4;                  p4 >>= 2;
+          p1 |= p2;                  p3 |= p4;              p1 |= p3;
+          REG_GROUP_SET(hw, nccTable0[4 + (i & 0x07)],
+                        (0x80000000 | ((i & 0xFE) << 23) | p1));
+          i++;
+        }
+        REG_GROUP_END();
+      }
+    }
+  }
+    
+  GR_END();
+#undef FN_NAME
+} /* _grTexDownloadPaletteExt */
+
+/*-------------------------------------------------------------------
+  Function: grTexDownloadTableExt
+  Date: 6/3
+  Implementor(s): jdt, GaryMcT
+  Library: glide
+  Description:
+    download look up table data to a tmu and record the table type in gc->state.tex_table
+  Arguments:
+    tmu - which tmu
+    type - what type of table to download
+        One of:
+            GR_TEXTABLE_NCC0
+            GR_TEXTABLE_NCC1
+            GR_TEXTABLE_PALETTE, GR_TEXTABLE_PALETTE_6666_EXT
+    void *data - pointer to table data
+  Return:
+    none
+  -------------------------------------------------------------------*/
+GR_ENTRY(grTexDownloadTableExt,
+         void,
+         (GrChipID_t tmu, GrTexTable_t type,  void *data))
+{
+  GR_BEGIN_NOFIFOCHECK("grTexDownloadTableExt",89);
+  GDBG_INFO_MORE(gc->myLevel,"(%d,%d,0x%x)\n",tmu,type,data);
+  GR_CHECK_TMU(myName,tmu);
+  GR_CHECK_F(myName, type > GR_TEXTABLE_PALETTE_6666_EXT, "invalid table specified");
+  GR_CHECK_F(myName, !data, "invalid data pointer");
+  /* FN_NAME is not defined in this function, so every check above uses myName. */
+  gc->state.tex_table = type;
+
+  if ((type == GR_TEXTABLE_PALETTE) || (type == GR_TEXTABLE_PALETTE_6666_EXT))     /* Need Palette Download Code */
+    _grTexDownloadPaletteExt(tmu, type, (GuTexPalette *)data, 0, 255);
+  else {                                 /* Type is an ncc table */
+    _grTexDownloadNccTableExt(tmu, type, (GuNccTable*)data, 0, 11);
+    /* type is passed as the 'which' index; presumably GR_TEXTABLE_NCC0/1 are 0/1 - confirm */
+  }
+  GR_END();
+} /* grTexDownloadTableExt */
+
+#undef GLIDE_POINTCAST_PALETTE
+
 /*---------------------------------------------------------------------------
 ** _grTexDownloadNccTable
 **
@@ -440,7 +731,6 @@ GR_DDFUNC(_grTexDownloadPalette,
   Return:
     none
   -------------------------------------------------------------------*/
-#if defined(GLIDE3) && defined(GLIDE3_ALPHA)
 GR_ENTRY(grTexDownloadTable,
          void,
          (GrTexTable_t type,  void *data))
@@ -460,27 +750,6 @@ GR_ENTRY(grTexDownloadTable,
   }
   GR_END();
 } /* grTexDownloadTable */
-#else
-GR_ENTRY(grTexDownloadTable,
-         void,
-         (GrChipID_t tmu, GrTexTable_t type,  void *data))
-{
-  GR_BEGIN_NOFIFOCHECK("grTexDownloadTable",89);
-  GDBG_INFO_MORE(gc->myLevel,"(%d,%d,0x%x)\n",tmu,type,data);
-  GR_CHECK_TMU(FN_NAME,tmu);
-  GR_CHECK_F(FN_NAME, type > GR_TEXTABLE_PALETTE, "invalid table specified");
-  GR_CHECK_F(FN_NAME, !data, "invalid data pointer");
-
-  if ((type == GR_TEXTABLE_PALETTE) && (GR_TEXTABLE_PALETTE_6666))     /* Need Palette Download Code */
-    _grTexDownloadPalette(tmu, type, (GuTexPalette *)data, 0, 255);
-  else {                                 /* Type is an ncc table */
-    _grTexDownloadNccTable(tmu, type, (GuNccTable*)data, 0, 11);
-    /*    _grTexDownloadNccTable(tmu, type, (GuNccTable*)data, 0, 11); */
-  }
-  GR_END();
-#undef FN_NAME
-} /* grTexDownloadTable */
-#endif
 
 
 /*-------------------------------------------------------------------
diff --git a/glide3x/cvg/glide3/src/xdraw2.asm b/glide3x/cvg/glide3/src/xdraw2.asm
index b452706..43fa5b0 100644
--- a/glide3x/cvg/glide3/src/xdraw2.asm
+++ b/glide3x/cvg/glide3/src/xdraw2.asm
@@ -19,6 +19,9 @@
 ;; $Header$
 ;; $Revision$
 ;; $Log$
+;; Revision 1.1.1.1  1999/12/07 21:42:35  joseph
+;; Initial checkin into SourceForge.
+;;
 ;; 
 ;; 1     10/08/98 11:30a Brent
 ;; 
@@ -60,117 +63,127 @@
 ; B4 Chip field fix.
 ;;
 
-TITLE   xdraw2.asm
-OPTION OLDSTRUCTS       
+%include "xos.inc"
 
-.586P
-.MMX
-.K3D
+extrn   _GlideRoot
+extrn   _FifoMakeRoom
+    
+%MACRO GR_FIFO_WRITE 3
+    mov     [%1 + %2], %3               ; [addr + offset] = data
+%ENDMACRO ; GR_FIFO_WRITE (args: addr, offset, data)
 
-EXTRN   __GlideRoot	    : DWORD
-EXTRN   __FifoMakeRoom	    : NEAR
+%MACRO WRITE_MM1_FIFO_ALIGNED 1
+; %1 = offset added to fifo; expands to nothing unless GL_AMD3D is defined
+; 3DNow!
+%ifdef GL_AMD3D
+    movq      [fifo+%1], mm1        ; store current param | previous param
+%endif
+
+%ENDMACRO ; WRITE_MM1_FIFO_ALIGNED
+
+%MACRO WRITE_MM1LOW_FIFO 0
+; No arguments; expands to nothing unless GL_AMD3D is defined
+; 3DNow
+%ifdef GL_AMD3D
+    movd      [fifo], mm1           ; store the low dword of mm1 at [fifo]
+%endif
+
+%ENDMACRO ; WRITE_MM1LOW_FIFO
+
+segment		DATA
+    One         DD  1.0
     Area        DD  0
-_DATA   ENDS    
 
 ;;; Definitions of cvg regs and glide root structures.
-INCLUDE fxgasm.h
+%INCLUDE "fxgasm.h"
 
 ;; enables/disables trisProcessed and trisDrawn counters
-STATS = 1
+%define STATS 1
 
 ; Arguments (STKOFF = 16 from 4 pushes)
-STKOFF  = 16
-_va$    =  4 + STKOFF
-_vb$    =  8 + STKOFF
-_vc$    = 12 + STKOFF    
+STKOFF  equ 16
+_va$    equ  4 + STKOFF
+_vb$    equ  8 + STKOFF
+_vc$    equ 12 + STKOFF
 
     ;; coordinate offsets into vertex.
     ;; NB:  These are constants and are not
     ;;	    user settable like the rest of the
     ;;	    parameter offset. Weird.
-X       = 0
-Y       = 4
+X       equ 0
+Y       equ 4
 
-CONST   SEGMENT
-$T2003  DD  046400000r          ; 12288
-$T2005  DD  03f800000r          ; 1
-$T2006  DD  043800000r          ; 256
-CONST   ENDS
+segment		CONST
+T2003  DD  12288.0      ; 12288
+T2005  DD  1.0          ; 1
+T2006  DD  256.0        ; 256
 
-PROC_TYPE MACRO procType:=<Default>
-    IFDEF GL_AMD3D
-	EXITM <__trisetup_3DNow_&procType&@12>
-    ELSE
-	EXITM <__trisetup_Default_&procType&@12>
-    ENDIF
-    ENDM    
+%MACRO PROC_TYPE 1
+    %IFDEF GL_AMD3D
+        proc _trisetup_3DNow_%1, 12     ; GL_AMD3D build: _trisetup_3DNow_<%1>
+    %ELSE
+        proc _trisetup_Default_%1, 12   ; otherwise: _trisetup_Default_<%1>
+    %ENDIF
+%ENDM ; PROC_TYPE - NOTE(review): 12 presumably mirrors the old @12 name suffix; confirm against proc in xos.inc
 
 ;--------------------------------------------------------------------------
 
-_TEXT       SEGMENT PAGE PUBLIC USE32 'CODE'
-            ASSUME DS: FLAT, SS: FLAT
+segment		TEXT
 
             ALIGN  32
 
-            PUBLIC  PROC_TYPE(cull)
-PROC_TYPE(cull)  PROC    NEAR
+PROC_TYPE cull
 
-GLIDE_CULLING       textequ <1>
-GLIDE_PACK_RGB      textequ <0>
-GLIDE_PACK_ALPHA    textequ <0>
-GLIDE_GENERIC_SETUP textequ <0>
-INCLUDE xdraw2.inc
-GLIDE_GENERIC_SETUP textequ <0>    
-GLIDE_PACK_ALPHA    textequ <0>
-GLIDE_PACK_RGB      textequ <0>    
-GLIDE_CULLING       textequ <0>
+%define GLIDE_CULLING       1
+%define GLIDE_PACK_RGB      0
+%define GLIDE_PACK_ALPHA    0
+%define GLIDE_GENERIC_SETUP 0
+%INCLUDE "xdraw2.inc"
+%undef GLIDE_GENERIC_SETUP
+%undef GLIDE_PACK_ALPHA
+%undef GLIDE_PACK_RGB
+%undef GLIDE_CULLING
 
-PROC_TYPE(cull) ENDP
+endp
 
             ALIGN  32
 
-            PUBLIC  PROC_TYPE()
-PROC_TYPE()  PROC    NEAR
+PROC_TYPE Default
 
-GLIDE_CULLING       textequ <0>
-GLIDE_PACK_RGB      textequ <0>
-GLIDE_PACK_ALPHA    textequ <0>
-GLIDE_GENERIC_SETUP textequ <0>
-INCLUDE xdraw2.inc
-GLIDE_GENERIC_SETUP textequ <0>    
-GLIDE_PACK_ALPHA    textequ <0>
-GLIDE_PACK_RGB      textequ <0>    
-GLIDE_CULLING       textequ <0>
+%define GLIDE_CULLING       0
+%define GLIDE_PACK_RGB      0
+%define GLIDE_PACK_ALPHA    0
+%define GLIDE_GENERIC_SETUP 0
+%INCLUDE "xdraw2.inc"
+%undef GLIDE_GENERIC_SETUP
+%undef GLIDE_PACK_ALPHA
+%undef GLIDE_PACK_RGB
+%undef GLIDE_CULLING
 
-PROC_TYPE() ENDP
+endp
 
-IFNDEF GL_AMD3D    
+%IFNDEF GL_AMD3D
 	    ALIGN   32
-	    PUBLIC  __trisetup_clip_coor_thunk@12
-__trisetup_clip_coor_thunk@12 PROC NEAR
+proc _trisetup_clip_coor_thunk, 12
 
-gc	TEXTEQU	<eax>		; Current graphics context
-procPtr TEXTEQU <ebx>    
-vPtr	TEXTEQU	<ecx>
+%define gc	eax		; Current graphics context
+%define procPtr edx
+%define vPtr	ecx
     
     ;; Call through to the gc->curArchProcs.drawTrianglesProc w/o
     ;; adding extra stuff to the stack. I wish we could actually
     ;; do a direct return here w/o too much work.
     lea	    vPtr, [esp + _va$ - STKOFF]	; Get vertex pointer address
-    mov     gc, [__GlideRoot + curGC]; GR_DCL_GC
+    mov     gc, [_GlideRoot + curGC]; GR_DCL_GC
 
     ;; If debugging make sure that we're in clip coordinates
-IFDEF GLIDE_DEBUG
-    mov     ebx, [gc + CoordinateSpace]
-    test    ebx, 1
+%IFDEF GLIDE_DEBUG
+    test    dword [gc + CoordinateSpace], 1
     jnz	    __clipSpace
     xor	    eax, eax
     mov	    [eax], eax
 __clipSpace:    
-ENDIF ; GLIDE_DEBUG
+%ENDIF ; GLIDE_DEBUG
 
     mov	    procPtr, [gc + drawTrianglesProc]; Prefetch drawTriangles proc addr
     push    vPtr		; vertex array address
@@ -180,10 +193,7 @@ ENDIF ; GLIDE_DEBUG
 
     call    procPtr		; (*gc->curArchProcs.drawTrianglesProc)(grDrawVertexArray, 3, vPtr)
 
-    ret	    12			; pop 3 dwords (vertex addrs) and return    
-__trisetup_clip_coor_thunk@12 ENDP
+    ret				; return -- NOTE(review): was 'ret 12'; confirm who now pops the 3 vertex dwords
+endp
 
-ENDIF ; !GL_AMD3D
-    
-_TEXT	ENDS
-	END
+%ENDIF ; !GL_AMD3D
diff --git a/glide3x/cvg/glide3/src/xdraw2.inc b/glide3x/cvg/glide3/src/xdraw2.inc
index b8802c1..9dc2f89 100644
--- a/glide3x/cvg/glide3/src/xdraw2.inc
+++ b/glide3x/cvg/glide3/src/xdraw2.inc
@@ -20,6 +20,9 @@
 ;; $Header$
 ;; $Revision$
 ;; $Log$
+;; Revision 1.1.1.1  1999/12/07 21:42:35  joseph
+;; Initial checkin into SourceForge.
+;;
 ; 
 ; 2     10/30/97 6:53p Peter
 ; first real cut at tri asm
@@ -38,42 +41,26 @@
 ;; AMD3D version
 ;;--------------------------------------------------------------------------
 
-ifdef GL_AMD3D
+%ifdef GL_AMD3D
 
-TITLE   xdraw2.inc
-
-GR_FIFO_WRITE   MACRO __addr, __offset, __data
-    mov    [__addr + __offset], __data
-ENDM ; GR_FIFO_WRITE
-
-
-WRITE_MM1_FIFO_ALIGNED MACRO __offset
-    movq      [fifo+__offset], mm1  ; store current param | previous param
-ENDM ; WRITE_MM1_FIFO_ALIGNED
-
-
-WRITE_MM1LOW_FIFO MACRO
-    movd      [fifo], mm1           ; store current param | previous param
-ENDM ; WRITE_MM1LOW_FIFO
-
-gc      TEXTEQU     <edi>           ; points to graphics context
-fifo    TEXTEQU     <ebp>           ; points to fifo entries
-tempVal TEXTEQU     <esi>
+%define gc      edi           ; points to graphics context
+%define fifo    ebp           ; points to fifo entries
+%define tempVal esi           ; scratch; holds the fifo space required for the triangle
 
     ;; Prologue stuff
     push      edi                   ; save caller's register variable
-    mov       gc,[__GlideRoot+curGC]; GR_DCL_GC
+    mov       gc,[_GlideRoot+curGC]; GR_DCL_GC
 
     push      ebp                   ; save frame pointer
     push      ebx                   ; save caller's register variable        
     
-IF GLIDE_CULLING
-fa      TEXTEQU     <eax>           ; vtx a from caller
-fb      TEXTEQU     <ebx>           ; vtx b from caller
-fc      TEXTEQU     <ecx>           ; vtx c from caller
+%IF GLIDE_CULLING
+%define fa      eax           ; vtx a from caller
+%define fb      ebx           ; vtx b from caller
+%define fc      ecx           ; vtx c from caller
     
-cull    TEXTEQU     <edx>           ; cull mode
-intArea TEXTEQU     <ecx>           ; area temp storage
+%define cull    edx           ; cull mode
+%define intArea ecx           ; area temp storage
     
     mov       fb, [esp + _vb$ - 4]  ; get base address of vertex B
     push      esi                   ; save caller's register variable
@@ -84,10 +71,10 @@ intArea TEXTEQU     <ecx>           ; area temp storage
     femms                           ; will use AMD3D, clear FPU/MMX registers
 
     cmp       cull, 0               ; culling enabled ?
-    mov       tempVal, [__GlideRoot + curTriSize]
+    mov       tempVal, [_GlideRoot + curTriSize]
 
     ;; Cull Check
-    jz        nocull                ; nope, no culling
+    jz        .nocull                ; nope, no culling
     mov       fa, [esp + _va$]      ; get base address of vertex A
 
     movq      mm2, [fc + X]         ; yc | xc
@@ -126,20 +113,20 @@ intArea TEXTEQU     <ecx>           ; area temp storage
     jge       __cullFail            ; triangle facing away from viewer, culled
 
     cmp       ebx, tempVal          ; fifo space required >= space available ?
-    jge       __triBegin            ; yup, push out triangle data to Voodoo
+    jge       .__triBegin            ; yup, push out triangle data to Voodoo
 
-    push      @Line                 ; line number inside this function
+    push      __LINE__              ; line number inside this function
     push      0h                    ; pointer to function name = NULL
 
     push      tempVal               ; fifo space required
-    call      __FifoMakeRoom        ; note: updates fifoPtr
+    call      _FifoMakeRoom        ; note: updates fifoPtr
 
     add       esp, 12               ; remove 3 DWORD arguments from stack
-    jmp       __triBegin            ; merge back with short path
+    jmp       .__triBegin            ; merge back with short path
 
     ;; culling disabled
 
-nocull:
+.nocull:
     ;; Check to make sure that we have enough room for
     ;; the complete triangle packet.
 
@@ -147,48 +134,48 @@ nocull:
     mov       ebx, [gc + fifoRoom]  ; fifo space available
 
     cmp       ebx, tempVal          ; fifo spce available >= space needed ?
-    jge       __triBegin            ; yup, ready to draw triangle
+    jge       .__triBegin            ; yup, ready to draw triangle
 
-    push      @Line                 ; line number inside this function
+    push      __LINE__              ; line number inside this function
     push      0h                    ; pointer to function name = NULL
 
     push      tempVal               ; fifo space needed
-    call      __FifoMakeRoom        ; note: updates fifoPtr
+    call      _FifoMakeRoom        ; note: updates fifoPtr
 
     add       esp, 12               ; remove 3 DWORD arguments from stack
     nop                             ; filler
 
-ELSE   ; !GLIDE_CULLING
+%ELSE   ; !GLIDE_CULLING
 
     lea       eax, [esp+ _va$]      ; pointer to vertex pointers    
     push      esi                   ; save caller's register variable
     
-    mov       tempVal, [__GlideRoot + curTriSize] ; data for whole triangle in bytes
+    mov       tempVal, [_GlideRoot + curTriSize] ; data for whole triangle in bytes
     mov       ebx, [gc + fifoRoom]  ; fifo space available
 
     add       tempVal, 4            ; fifo space needed (include 4-byte header)
     femms                           ; will use AMD3D, clear FPU/MMX registers
 
     cmp       ebx, tempVal          ; fifo spce available >= space needed ?
-    jge       __triBegin            ; yup, ready to draw triangle
+    jge       .__triBegin            ; yup, ready to draw triangle
 
-    push      @Line                 ; line number inside this function
+    push      __LINE__              ; line number inside this function
     push      0h                    ; pointer to function name = NULL
 
     push      tempVal               ; fifo space needed
-    call      __FifoMakeRoom        ; note: updates fifoPtr
+    call      _FifoMakeRoom        ; note: updates fifoPtr
 
     add       esp, 12               ; remove 3 DWORD arguments from stack
     nop                             ; filler
-ENDIF  ; GLIDE_CULLING
+%ENDIF  ; GLIDE_CULLING
 
 
-dlp     TEXTEQU     <ebx>           ; points to dataList structure
-dlpstrt TEXTEQU     <ecx>           ; points to begin of dataList structure
-vertex  TEXTEQU     <edx>           ; the current vertex
-packCol TEXTEQU     <esi>
+%define dlp     ebx           ; points to dataList structure
+%define dlpstrt ecx           ; points to begin of dataList structure
+%define vertex  edx           ; the current vertex
+%define packCol esi
 
-__triBegin:
+.__triBegin:
     mov       eax, [gc+triPacketHdr]; Packet 3 header
     lea       dlp,[gc + tsuDataList]; Reset the dataList
 
@@ -198,7 +185,7 @@ __triBegin:
     mov       dlpstrt, dlp          ; save pointer to start of dataList
     test      fifo, 4               ; is fifo pointer qword aligned ?
 
-    jz        __fifo_aligned        ; yes, it is qword aligned
+    jz        .__fifo_aligned        ; yes, it is qword aligned
     movq      mm1, [vertex+X]       ; y | x
 
     GR_FIFO_WRITE fifo, 0, eax      ; write header to fifo; now qword aligned
@@ -207,8 +194,8 @@ __triBegin:
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write y | x
     nop                             ; filler
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm1, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -227,7 +214,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm1, mm2              ; 00000000 | 00rrggbb
     por       mm1, mm3              ; 00000000 | aarrggbb
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm1, [vertex+r]       ; g | r
@@ -241,7 +228,7 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     psrlq     mm1, 24               ; 00000000 | 0000gg00
     por       mm1, mm2              ; 00000000 | 00rrggbb
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     ;; here: one DWORD in "write buffer", RGB(A)
 
@@ -273,7 +260,7 @@ __paramLoop1a:
     nop                             ; filler
     jmp       __paramLoopDoneWBone1 ; merge back into common stream
 
-ELSE ; ! GLIDE_PACK_RGB
+%ELSE ; ! GLIDE_PACK_RGB -- NOTE(review): the GLIDE_PACK_RGB side still jumps to non-local __paramLoopDone* names while those labels are now .local; confirm it assembles
 
     ;; here: "write buffer" empty
 
@@ -281,14 +268,14 @@ ELSE ; ! GLIDE_PACK_RGB
     test      eax, eax              ; at end of list ?
 
     lea       dlp, [dlp+4]          ; dlp++
-    jz        __paramLoopDoneWBzero1; yes, "write buffer" empty
+    jz        .__paramLoopDoneWBzero1; yes, "write buffer" empty
           
-__paramLoop1a:
+.__paramLoop1a:
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp]            ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jz        __paramLoopDoneWBone1 ; exit, write buffer contains one DWORD
+    jz        .__paramLoopDoneWBone1 ; exit, write buffer contains one DWORD
 
     movd      mm2, [eax+vertex]     ; get next parameter
     add       dlp, 8                ; dlp += 2
@@ -300,14 +287,14 @@ __paramLoop1a:
     test      eax, eax              ; at end of offset list (offset == 0) ?
 
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jnz       __paramLoop1a         ; nope, copy next parameter
+    jnz       .__paramLoop1a         ; nope, copy next parameter
 
     nop                             ; filler
-    jmp       __paramLoopDoneWBzero1; write buffer empty
+    jmp       .__paramLoopDoneWBzero1; write buffer empty
     
-ENDIF ; GLIDE_PACK_RGB
+%ENDIF ; GLIDE_PACK_RGB
 
-__fifo_aligned:
+.__fifo_aligned:
     movd      mm2, [vertex+X]       ; y | x of vertex A
     add       fifo, 8               ; fifoPtr += 2*sizeof(FxU32)
 
@@ -317,8 +304,8 @@ __fifo_aligned:
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write x | header
     movd      mm1, [vertex+Y]       ; 0 | y of vertex A
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm4, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -337,7 +324,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm4, mm2              ; 00000000 | 00rrggbb
     por       mm4, mm3              ; 00000000 | aarrggbb
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm4, [vertex+r]       ; g | r
@@ -351,7 +338,7 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     psrlq     mm4, 24               ; 00000000 | 0000gg00
     por       mm4, mm2              ; 00000000 | 00rrggbb
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     punpckldq mm1, mm4              ; RGB(A) | y
     mov       eax, [dlp]            ; get first offset from the data list
@@ -387,14 +374,14 @@ __paramLoop1b:
     nop                             ; filler
     jmp       __paramLoopDoneWBzero1; write buffer empty
 
-ELSE ; !GLIDE_PACK_RGB
+%ELSE ; !GLIDE_PACK_RGB
     mov       eax, [dlp]            ; get first offset from the data list
     add       dlp, 4                ; dlp++
 
     cmp       eax, 0                ; end of list ?
-    jz        __paramLoopDoneWBone1 ; yes, "write buffer" has y data
+    jz        .__paramLoopDoneWBone1 ; yes, "write buffer" has y data
 
-__paramLoop1b:
+.__paramLoop1b:
     movd      mm2, [eax+vertex]     ; get next parameter
     add       fifo, 8               ; fifoPtr += 2*sizeof(FxU32)
 
@@ -405,16 +392,16 @@ __paramLoop1b:
     test      eax, eax              ; at end of offset list (offset == 0) ?
   
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jz        __paramLoopDoneWBzero1; exit, "write buffer" empty
+    jz        .__paramLoopDoneWBzero1; exit, "write buffer" empty
 
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp-4]          ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jnz       __paramLoop1b         ; nope, copy next parameter
-ENDIF
+    jnz       .__paramLoop1b         ; nope, copy next parameter
+%ENDIF
 
-__paramLoopDoneWBone1:
+.__paramLoopDoneWBone1:
 
     ;; here: "write buffer" has one DWORD left over from vertex A
 
@@ -430,8 +417,8 @@ __paramLoopDoneWBone1:
     movd      mm1, [vertex+Y]       ; 0 | y of vertex B
     nop                             ; filler
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm4, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -450,7 +437,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm4, mm2              ; 00000000 | 00rrggbb
     por       mm4, mm3              ; 00000000 | aarrggbb
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm4, [vertex+r]       ; g | r
@@ -464,7 +451,7 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     add       dlp, 4                ; next data list entry
     por       mm4, mm2              ; 00000000 | 00rrggbb
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     punpckldq mm1, mm4              ; RGB(A) | y
     mov       eax, [dlp]            ; get first offset from the data list
@@ -500,14 +487,14 @@ __paramLoop2b:
     nop                             ; filler
     jmp       __paramLoopDoneWBzero2; write buffer empty
 
-ELSE ; !GLIDE_PACK_RGB
+%ELSE ; !GLIDE_PACK_RGB
     mov       eax, [dlp]            ; get first offset from the data list
     add       dlp, 4                ; dlp++
 
     test      eax, eax              ; end of list ?
-    jz        __paramLoopDoneWBone2 ; yes, "write buffer" has y data
+    jz        .__paramLoopDoneWBone2 ; yes, "write buffer" has y data
 
-__paramLoop2b:
+.__paramLoop2b:
     movd      mm2, [eax+vertex]     ; get next parameter
     add       fifo, 8               ; fifoPtr += 2*sizeof(FxU32)
 
@@ -518,20 +505,20 @@ __paramLoop2b:
     test      eax, eax              ; at end of offset list (offset == 0) ?
   
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jz        __paramLoopDoneWBzero2; exit, "write buffer" empty
+    jz        .__paramLoopDoneWBzero2; exit, "write buffer" empty
 
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp-4]          ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jnz       __paramLoop2b         ; nope, copy next parameter
+    jnz       .__paramLoop2b         ; nope, copy next parameter
 
     nop                             ; filler
-    jmp       __paramLoopDoneWBone2 ; write buffer contains one DWORD
-ENDIF
+    jmp       .__paramLoopDoneWBone2 ; write buffer contains one DWORD
+%ENDIF
 
 
-__paramLoopDoneWBzero1:
+.__paramLoopDoneWBzero1:
 
     mov       vertex, [esp + _vb$]  ; Current vertex = B
     mov       dlp, dlpstrt          ; Reset the dataList
@@ -542,8 +529,8 @@ __paramLoopDoneWBzero1:
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write y | x of vertex B
     nop                             ; filler
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm1, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -562,7 +549,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm1, mm3              ; 00000000 | aarrggbb
     add       dlp, 8                ; skip data list entry "a"
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm1, [vertex+r]       ; g | r
@@ -576,11 +563,11 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     por       mm1, mm2              ; 00000000 | 00rrggbb
     add       dlp, 4                ; next data list entry
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     ;; here: one DWORD in "write buffer", RGB(A)
 
-    mov       eax, DWORD PTR [dlp]  ; get first offset from the data list
+    mov       eax, dword [dlp]  ; get first offset from the data list
     add       dlp, 4                ; dlp++
 
     test      eax, eax              ; end of list ?
@@ -608,7 +595,7 @@ __paramLoop2a:
     nop                             ; filler
     jmp       __paramLoopDoneWBone2 ; merge back into common stream
 
-ELSE ; ! GLIDE_PACK_RGB
+%ELSE ; ! GLIDE_PACK_RGB
 
     ;; here: "write buffer" empty
 
@@ -616,14 +603,14 @@ ELSE ; ! GLIDE_PACK_RGB
     add       dlp, 4                ; dlp++
 
     cmp       eax, 0                ; at end of list ?
-    jz        __paramLoopDoneWBzero2; yes, "write buffer" empty
+    jz        .__paramLoopDoneWBzero2; yes, "write buffer" empty
           
-__paramLoop2a:
+.__paramLoop2a:
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp]            ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jz        __paramLoopDoneWBone2 ; exit, write buffer contains one DWORD
+    jz        .__paramLoopDoneWBone2 ; exit, write buffer contains one DWORD
 
     movd      mm2, [eax+vertex]     ; get next parameter
     add       dlp, 8                ; dlp += 2
@@ -635,12 +622,12 @@ __paramLoop2a:
     test      eax, eax              ; at end of offset list (offset == 0) ?
 
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jnz       __paramLoop2a         ; nope, copy next parameter
+    jnz       .__paramLoop2a         ; nope, copy next parameter
 
-ENDIF ; GLIDE_PACK_RGB
+%ENDIF ; GLIDE_PACK_RGB
 
 
-__paramLoopDoneWBzero2:
+.__paramLoopDoneWBzero2:
 
     mov       vertex, [esp + _vc$]  ; Current vertex = C
     mov       dlp, dlpstrt          ; Reset the dataList
@@ -652,8 +639,8 @@ __paramLoopDoneWBzero2:
     nop                             ; filler
 
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm1, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -672,7 +659,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm1, mm3              ; 00000000 | aarrggbb
     add       dlp, 8                ; skip data list entry "a"
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm1, [vertex+r]       ; g | r
@@ -686,7 +673,7 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     por       mm1, mm2              ; 00000000 | 00rrggbb
     add       dlp, 4                ; next data list entry
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     ;; here: one DWORD in "write buffer", RGB(A)
 
@@ -718,7 +705,7 @@ __paramLoop3a:
     nop                             ; filler
     jmp       __paramLoopDoneWBone3 ; merge back into common stream
 
-ELSE ; ! GLIDE_PACK_RGB
+%ELSE ; ! GLIDE_PACK_RGB
 
     ;; here: "write buffer" empty
 
@@ -726,14 +713,14 @@ ELSE ; ! GLIDE_PACK_RGB
     add       dlp, 4                ; dlp++
 
     test      eax, eax              ; at end of list ?
-    jz        __paramLoopDoneWBzero3; yes, "write buffer" empty
+    jz        .__paramLoopDoneWBzero3; yes, "write buffer" empty
           
-__paramLoop3a:
+.__paramLoop3a:
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp]            ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jz        __paramLoopDoneWBone3 ; exit, write buffer contains one DWORD
+    jz        .__paramLoopDoneWBone3 ; exit, write buffer contains one DWORD
 
     movd      mm2, [eax+vertex]     ; get next parameter
     add       dlp, 8                ; dlp += 2
@@ -745,15 +732,15 @@ __paramLoop3a:
     test      eax, eax              ; at end of offset list (offset == 0) ?
 
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jnz       __paramLoop3a         ; nope, copy next parameter
+    jnz       .__paramLoop3a         ; nope, copy next parameter
 
     mov       esp, esp              ; filler
-    jmp       __paramLoopDoneWBzero3; write buffer empty
+    jmp       .__paramLoopDoneWBzero3; write buffer empty
 
-ENDIF ; GLIDE_PACK_RGB
+%ENDIF ; GLIDE_PACK_RGB
 
 
-__paramLoopDoneWBone2:
+.__paramLoopDoneWBone2:
 
     ;; here: "write buffer" has one DWORD left over from vertex B
 
@@ -769,8 +756,8 @@ __paramLoopDoneWBone2:
     movd      mm1, [vertex+Y]       ; 0 | y of vertex C
     mov       esp, esp              ; filler
 
-IF GLIDE_PACK_RGB
-IF GLIDE_PACK_ALPHA
+%IF GLIDE_PACK_RGB
+%IF GLIDE_PACK_ALPHA
     ;; assumes color and alpha values < 256.0
     movq      mm4, [vertex+r]       ; g | r
     movd      mm2, [vertex+b]       ; 0 | b
@@ -789,7 +776,7 @@ IF GLIDE_PACK_ALPHA
 
     por       mm4, mm2              ; 00000000 | 00rrggbb
     por       mm4, mm3              ; 00000000 | aarrggbb
-ELSE ; !GLIDE_PACK_ALPHA
+%ELSE ; !GLIDE_PACK_ALPHA
     ;; assumes color values < 256.0
 
     movq      mm4, [vertex+r]       ; g | r
@@ -803,7 +790,7 @@ ELSE ; !GLIDE_PACK_ALPHA
 
     add       dlp, 4                ; next data list entry
     por       mm4, mm2              ; 00000000 | 00rrggbb
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     punpckldq mm1, mm4              ; RGB(A) | y
     add       fifo, 8               ; fifoPtr += 2*sizeof(FxU32)
@@ -839,15 +826,15 @@ __paramLoop3b:
     nop                             ; filler
     jmp       __paramLoopDoneWBzero3; write buffer empty
 
-ELSE ; !GLIDE_PACK_RGB
+%ELSE ; !GLIDE_PACK_RGB
 
     mov       eax, [dlp]            ; get first offset from the data list
     add       dlp, 4                ; dlp++
 
     test      eax, eax              ; end of list ?
-    jz        __paramLoopDoneWBone3 ; yes, "write buffer" has y data
+    jz        .__paramLoopDoneWBone3 ; yes, "write buffer" has y data
 
-__paramLoop3b:
+.__paramLoop3b:
     movd      mm2, [eax+vertex]     ; get next parameter
     add       fifo, 8               ; fifoPtr += 2*sizeof(FxU32)
 
@@ -858,27 +845,27 @@ __paramLoop3b:
     cmp       eax, 0                ; at end of offset list (offset == 0) ?
   
     WRITE_MM1_FIFO_ALIGNED -8       ; PCI write current param | previous param
-    jz        __paramLoopDoneWBzero3; exit, "write buffer" empty
+    jz        .__paramLoopDoneWBzero3; exit, "write buffer" empty
 
     movd      mm1, [eax+vertex]     ; get next parameter
     mov       eax, [dlp-4]          ; offset = *(dlp + 1)
 
     test      eax, eax              ; at end of offset list (offset == 0) ?
-    jnz       __paramLoop3b         ; nope, copy next parameter
-ENDIF
+    jnz       .__paramLoop3b         ; nope, copy next parameter
+%ENDIF
 
-__paramLoopDoneWBone3:
+.__paramLoopDoneWBone3:
 
 ; "write buffer" contains one DWORD that needs to be flushed
 
     WRITE_MM1LOW_FIFO               ; 
     add       fifo, 4               ;
 
-__paramLoopDoneWBzero3:
+.__paramLoopDoneWBzero3:
 
     ;; Update gc->fifoPtr and gc->fifoRoom
 
-    mov       ecx, [__GlideRoot + trisDrawn]    ; _GlideRoot.stats.trisDrawn
+    mov       ecx, [_GlideRoot + trisDrawn]    ; _GlideRoot.stats.trisDrawn
     mov       eax, fifo                         ; new fifo pointer
     
     mov       ebx, [gc + fifoPtr]               ; old fifo pointer
@@ -887,10 +874,10 @@ __paramLoopDoneWBzero3:
     mov       edx, [gc + fifoRoom]              ; old fifo space available
     inc       ecx                               ; _GlideRoot.stats.trisDrawn++
 
-    mov       ebp, [__GlideRoot + trisProcessed]; _GlideRoot.stats.trisProcessed
+    mov       ebp, [_GlideRoot + trisProcessed]; _GlideRoot.stats.trisProcessed
     sub       eax, ebx                          ; new fifo ptr - old fifo ptr = additional fifo space used
 
-    mov       [__GlideRoot + trisDrawn], ecx    ;
+    mov       [_GlideRoot + trisDrawn], ecx    ;
     sub       edx, eax                          ; new fifo space available
 
     mov       eax, 1h                           ; return value = triangle drawn
@@ -901,7 +888,7 @@ __paramLoopDoneWBzero3:
     inc       ebp                   ; _GlideRoot.stats.trisProcessed++
     pop       esi                   ; restore caller's register variable
 
-    mov       [__GlideRoot + trisProcessed], ebp ;    
+    mov       [_GlideRoot + trisProcessed], ebp ;
     pop       ebx                   ; restore caller's register variable
 
     pop       ebp                   ; restore frame pointer
@@ -909,12 +896,12 @@ __paramLoopDoneWBzero3:
 
     femms                           ; no more AMD3D code, clear FPU/MMX regs
 
-    ret       12                    ; return to caller
+    ret                             ; return to caller
 
 
-IF GLIDE_CULLING
+%IF GLIDE_CULLING
 __cullFail:
-    mov       ebp, [__GlideRoot + trisProcessed]; triangles processed so far
+    mov       ebp, [_GlideRoot + trisProcessed]; triangles processed so far
     xor       eax, eax              ; return value = triangle not drawn
 
     femms                           ; no more AMD3D code, clear FPU/MMX regs
@@ -923,14 +910,14 @@ __cullFail:
     inc       ebp                   ; _GlideRoot.stats.trisProcessed++;    
     pop       esi
 
-    mov       [__GlideRoot + trisProcessed], ebp
+    mov       [_GlideRoot + trisProcessed], ebp
     pop       ebx
     
     pop       ebp                   ; restore frame pointer
     pop       edi
 
-    ret       12
-ENDIF ; GLIDE_CULLING
+    ret
+%ENDIF ; GLIDE_CULLING
 
 ;---------------------------------------------------------------------------
 ;
@@ -938,26 +925,24 @@ ENDIF ; GLIDE_CULLING
 ;
 ;---------------------------------------------------------------------------
 
-endif  ; !defined GL_AMD3D
+%endif  ; !defined GL_AMD3D
 
 ;;--------------------------------------------------------------------------
 ;; start original code
 ;;--------------------------------------------------------------------------
 
-ifndef GL_AMD3D
+%ifndef GL_AMD3D
 
-TITLE   xdraw2.inc
-        
 ; Ugly, but seems to workaround the problem with locally defined
 ; data segment globals not getting relocated properly when using
 ; djgpp.
 
-zArea   TEXTEQU	    <One + 04h>
-gc      TEXTEQU     <esi>       ; points to graphics context
+%define zArea   One + 04h ; dword scratch: float area is stored here and re-read as an integer
+%define gc      esi       ; points to graphics context
 
     ;; Prologue stuff
     push    esi
-    mov     gc, [__GlideRoot + curGC]    ;; GR_DCL_GC
+    mov     gc, [_GlideRoot + curGC]    ;; GR_DCL_GC
     
     push    edi
     push    ebx
@@ -966,18 +951,18 @@ gc      TEXTEQU     <esi>       ; points to graphics context
     nop
             
             align 4
-IF GLIDE_CULLING
-fa      TEXTEQU     <eax>       ; vtx a from caller
-fb      TEXTEQU     <ebx>       ; vtx b from caller
-fc      TEXTEQU     <ecx>       ; vtx c from caller
+%IF GLIDE_CULLING
+%define fa      eax       ; vtx a from caller
+%define fb      ebx       ; vtx b from caller
+%define fc      ecx       ; vtx c from caller
 
-cull    TEXTEQU     <edx>
-intArea TEXTEQU     <ebp>       ; temp Y storage
+%define cull    edx
+%define intArea ebp       ; temp Y storage
 
 ; some useful floating load and store macros <ala gmt>
-flds    TEXTEQU <fld  DWORD PTR>
-fsubs   TEXTEQU <fsub DWORD PTR>
-fmuls   TEXTEQU <fmul DWORD PTR>    
+%define flds    fld  DWORD	; fld of a 32-bit memory operand
+%define fsubs   fsub DWORD	; fsub with a 32-bit memory operand
+%define fmuls   fmul DWORD	; fmul with a 32-bit memory operand
 
     ;; Pre-load the current culling mode before all of the
     ;; floating point area stuff.    
@@ -988,11 +973,11 @@ fmuls   TEXTEQU <fmul DWORD PTR>
     mov     fc, [esp + _vc$]
 
     test    cull, cull    
-    jz      nocull
+    jz      .nocull
 
     shl     cull, 31                    ; culltest << 31    
         
-Area_Computation:    
+;Area_Computation:	(label disabled; presumably because a non-local label here would end the scope of the .local labels -- confirm)
 ; 47-3
 ; jmp ret_pop0f
     flds    [fa + X]            ;  xa
@@ -1003,70 +988,66 @@ Area_Computation:
     fsubs   [fc + Y]            ;  |    |    dyBC
     flds    [fa + Y]            ;  |    |    |    ya
     fsubs   [fb + Y]            ;  |    |    |    dyAB
-    fld     st(3)               ;  |    |    |    |    dxAB
-    fmul    st, st(2)           ;  |    |    |    |    t0         t0=dxAB*dyBC
-    fld     st(3)               ;  |    |    |    |    |    dxBC
-    fmul    st, st(2)           ;  |    |    |    |    |    t1    t1=dxBC*dyAB
-    fsubp   st(1),st            ;  |    |    |    |    area
-    fst     zArea               ;  |    |    |    |    area
+    fld     st3                 ;  |    |    |    |    dxAB
+    fmul    st0, st2            ;  |    |    |    |    t0         t0=dxAB*dyBC
+    fld     st3                 ;  |    |    |    |    |    dxBC
+    fmul    st0, st2            ;  |    |    |    |    |    t1    t1=dxBC*dyAB
+    fsubp   st1,st0             ;  |    |    |    |    area
+    fst     dword [zArea]       ;  |    |    |    |    area
 
     ;; Pop temp things from the sw culling off the fp stack
-    fstp    st(0)   ; 4
-    fstp    st(0)   ; 3
-    fstp    st(0)   ; 2
-    fstp    st(0)   ; 1
-    fstp    st(0)   ; 0    
+    fstp    st0   ; 4
+    fstp    st0   ; 3
+    fstp    st0   ; 2
+    fstp    st0   ; 1
+    fstp    st0   ; 0
 
-    mov     intArea, zArea        ; j = *(long *)&area
+    mov     intArea, [zArea]      ; j = *(long *)&area
     xor     eax, eax              ; Clear the return value (0 == culled)
 
     ; Zero Area Triangle Check
     and     intArea, 7fffffffh    ; if ((j & 0x7FFFFFFF) == 0)
-    jz      __triDone
+    jz      .__triDone
 
     ;; Triangle area check vs culling mode
-    mov     intArea, zArea              ; reload area just in case we're culling
+    mov     intArea, [zArea]              ; reload area just in case we're culling
     xor     intArea, cull               ; if (j ^ (culltest << 31))
     
-    jge     __triDone
-nocull: 
-ENDIF ; GLIDE_CULLING    
+    jge     .__triDone
+.nocull:
+%ENDIF ; GLIDE_CULLING
 
             align 4
     ;; Check to make sure that we have enough room for
     ;; the complete triangle packet.
-    mov     eax, [__GlideRoot + curTriSize]
+    mov     eax, [_GlideRoot + curTriSize]
     mov     ebx, [gc + fifoRoom]
 
     add     eax, 4
     cmp     ebx, eax
 
-    jge     __triBegin
+    jge     .__triBegin
     
-    push    @Line
+    push    __LINE__
     push    0h
     
     push    eax
-    call    __FifoMakeRoom
+    call    _FifoMakeRoom
 
     add     esp, 12
 
     ;; Send triangle parameters
     
-dlp     TEXTEQU     <ebx>       ; points to dataList structure
-fifo    TEXTEQU     <ebp>       ; points to next entry in fifo
-vertex  TEXTEQU     <edx>       ; the current vertex
-vOffset TEXTEQU     <ecx>       ; Current vertex offset
+%define dlp     ebx       ; points to dataList structure
+%define fifo    ebp       ; points to next entry in fifo
+%define vertex  edx       ; the current vertex
+%define vOffset ecx       ; Current vertex offset
 
-packCol TEXTEQU     <edi>
-tempVal TEXTEQU     <edi>
-
-GR_FIFO_WRITE   MACRO __addr, __offset, __data
-    mov    [__addr + __offset], __data
-ENDM ; GR_FIFO_WRITE
+%define packCol edi       ; packed color value
+%define tempVal edi       ; scratch; names the same register as packCol
 
             align 4    
-__triBegin:
+.__triBegin:
     mov     fifo, [gc + fifoPtr]        ; Fetch Fifo Ptr
     mov     vOffset, 4                  ; Starting vertex
 
@@ -1077,44 +1058,44 @@ __triBegin:
     add     fifo, 4                     ; Advance fifo for hdr & x/y coordinate
 
             align 4    
-__vertexStart:
+.__vertexStart:
     mov     vertex, [esp + STKOFF + vOffset]    ; Current vertex
     add     fifo, 8    
 
     nop                                         ; Avoid p5 agi w/ load of vertex ptr
     nop
     
-    mov     eax, DWORD PTR [vertex]             ; X
+    mov     eax, dword [vertex]             ; X
     lea     dlp, [gc + tsuDataList]             ; Reset the dataList
 
     GR_FIFO_WRITE fifo, -8, eax                 ; PCI write X
-    mov     eax, DWORD PTR [vertex + 4]         ; Y 
+    mov     eax, dword [vertex + 4]         ; Y 
 
     xor     packCol, packCol                    ; Clear packed color
     GR_FIFO_WRITE fifo, -4, eax                 ; PCI write Y
 
-IF GLIDE_PACK_RGB
-    fld     DWORD PTR [vertex + b]              ; B
-    fadd    DWORD PTR __GlideRoot + fBiasLo      ; BC GC
+%IF GLIDE_PACK_RGB
+    fld     dword [vertex + b]              ; B
+    fadd    dword [_GlideRoot + fBiasLo]    ; BC GC
 
-    fld     DWORD PTR [vertex + g]              ; G B
-    fadd    DWORD PTR __GlideRoot + fBiasHi      ; GC B
+    fld     dword [vertex + g]              ; G B
+    fadd    dword [_GlideRoot + fBiasHi]    ; GC B
     
-    fld     DWORD PTR [vertex + r]              ; R GC BC
-    fadd    DWORD PTR __GlideRoot + fBiasHi      ; RC GC BC
+    fld     dword [vertex + r]              ; R GC BC
+    fadd    dword [_GlideRoot + fBiasHi]    ; RC GC BC
 
-    fxch    st(2)                               ; BC GC RC
-    fstp    DWORD PTR bias0                     ; GC RC
+    fxch    st2                             ; BC GC RC
+    fstp    dword [bias0]                   ; GC RC
 
-    fstp    DWORD PTR bias1                     ; RC
-    mov     packCol, DWORD PTR bias0            ; B + bias
+    fstp    dword [bias1]                   ; RC
+    mov     packCol, dword [bias0]          ; B + bias
 
-    fstp    DWORD PTR bias0
-    mov     eax, DWORD PTR bias1                ; G + bias
+    fstp    dword [bias0]
+    mov     eax, dword [bias1]              ; G + bias
     
-IF GLIDE_PACK_ALPHA
-    fld     DWORD PTR [vertex + a]
-    fadd    DWORD PTR __GlideRoot + fBiasHi
+%IF GLIDE_PACK_ALPHA
+    fld     dword [vertex + a]
+    fadd    dword [_GlideRoot + fBiasHi]
 
     and     packCol, 00FFh                      ; B color component
     and     eax, 0000FF00h                      ; G component << 8
@@ -1125,10 +1106,10 @@ IF GLIDE_PACK_ALPHA
     or      packCol, eax                        ; 0000GGBB
     nop
 
-    fstp    DWORD PTR bias1
-    mov     eax, DWORD PTR bias0                ; R + bias
+    fstp    dword [bias1]
+    mov     eax, dword [bias0]                ; R + bias
     
-    mov     esi, DWORD PTR bias1                ; A + bias
+    mov     esi, dword [bias1]                ; A + bias
     and     eax, 0000FF00h                      ; R component << 8
     
     and     esi, 0FFFFFF00h                     ; A component << 8
@@ -1139,56 +1120,56 @@ IF GLIDE_PACK_ALPHA
 
     or      packCol, esi                        ; AARRGGBB
     nop
-ELSE ; !GLIDE_PACK_ALPHA    
+%ELSE ; !GLIDE_PACK_ALPHA
     and     packCol, 00FFh                      ; B color component
     and     eax, 0000FF00h                      ; G component << 8
 
     add     dlp, 4                              ; Next dataList item    
     or      packCol, eax
     
-    mov     eax, DWORD PTR bias0                ; R + bias
+    mov     eax, dword [bias0]                ; R + bias
     and     eax, 0000FF00h                      ; R component << 8
 
     shl     eax, 8                              ; R << 16
     or      packCol, eax                        ; 00RRGGBB
-ENDIF ; !GLIDE_PACK_ALPHA
+%ENDIF ; !GLIDE_PACK_ALPHA
 
     GR_FIFO_WRITE fifo, 0, packCol              ; PCI write packed color value
     add     fifo, 4
-ENDIF ; GLIDE_PACK_RGB
+%ENDIF ; GLIDE_PACK_RGB
 
-__doParams:
-    mov     eax, DWORD PTR [dlp]                ; Get first offset from the data list
+.__doParams:
+    mov     eax, dword [dlp]                ; Get first offset from the data list
     add     dlp, 4                              ; dlp++
     
     cmp     eax, 0                              ; Are we done?
-    je      __nextVertex
+    je      .__nextVertex
 
     ;; Not using align directive here because it sometimes
     ;; introduces an agi for the eax use below.
     nop
     nop
         
-__paramLoop:
-    mov     tempVal, DWORD PTR [eax + vertex]   ; Get the parameter from teh vertex
+.__paramLoop:
+    mov     tempVal, dword [eax + vertex]   ; Get the parameter from the vertex
     add     fifo, 4                             ; fifoPtr += sizeof(FxU32)
 
-    mov     eax, DWORD PTR [dlp]                ; offset = *(dlp + 1)
+    mov     eax, dword [dlp]                ; offset = *(dlp + 1)
     add     dlp, 4                              ; dlp++
     
     cmp     eax, 0                              ; Are we done?
     GR_FIFO_WRITE fifo, -4, tempVal             ; *fifoPtr = data
     
-    jne     SHORT __paramLoop
+    jne     .__paramLoop
 
                 align 4        
-__nextVertex:   
+.__nextVertex:
     ;; On to the next vertex
     add     vOffset, 4
-    mov     gc, [__GlideRoot + curGC]            ; Reload gc incase we trashed it as a temp
+    mov     gc, [_GlideRoot + curGC]            ; Reload gc in case we trashed it as a temp
 
     cmp     vOffset, 16                         ; Offset of one past last vertex?
-    jne     __vertexStart
+    jne     .__vertexStart
 
     ;; Update gc->fifoPtr and gc->fifoRoom
     mov     eax, fifo
@@ -1197,27 +1178,27 @@ __nextVertex:
     mov     [gc + fifoPtr], fifo
     sub     eax, ebx
 
-    mov     ebx, [__GlideRoot + trisDrawn]               ; _GlideRoot.stats.trisDrawn++;    
+    mov     ebx, [_GlideRoot + trisDrawn]               ; _GlideRoot.stats.trisDrawn++;
     sub     [gc + fifoRoom], eax
 
     add     ebx, 1
-    mov     [__GlideRoot + trisDrawn], ebx
+    mov     [_GlideRoot + trisDrawn], ebx
 
     ;; return 1 (triangle drawn)    
     mov     eax, 1h
 
-__triDone:    
+.__triDone:
     ;; Restore trashed registers
-    mov     esi, [__GlideRoot + trisProcessed]
+    mov     esi, [_GlideRoot + trisProcessed]
     pop     ebp
         
     add     esi, 1    ; _GlideRoot.stats.trisProcessed++;    
     pop     ebx
     
     pop     edi
-    mov     [__GlideRoot + trisProcessed], esi
+    mov     [_GlideRoot + trisProcessed], esi
         
     pop     esi
-    ret     12
+    ret
 
-endif ; !GL_AMD3D
+%endif ; !GL_AMD3D
diff --git a/glide3x/cvg/glide3/src/xdraw3.asm b/glide3x/cvg/glide3/src/xdraw3.asm
index eb19950..3dd66e5 100644
--- a/glide3x/cvg/glide3/src/xdraw3.asm
+++ b/glide3x/cvg/glide3/src/xdraw3.asm
@@ -16,65 +16,58 @@
 ;; 
 ;; COPYRIGHT 3DFX INTERACTIVE, INC. 1999, ALL RIGHTS RESERVED
 
+%include "xos.inc"
+
 ;;--------------------------------------------------------------------------
 ;; start AMD3D version
 ;;--------------------------------------------------------------------------
 
-ifdef GL_AMD3D
-
-TITLE   xdraw3.asm
-.586P
-.MMX
-.K3D
+%ifdef GL_AMD3D
 
 ;;; include listing.inc
-INCLUDE fxgasm.h
+%INCLUDE "fxgasm.h"
 
-EXTRN   __GlideRoot:DWORD
-EXTRN   __FifoMakeRoom:NEAR
+extrn   _GlideRoot
+extrn   _FifoMakeRoom
 
-CONST   SEGMENT
+segment		CONST
         ALIGN 8
-_F256_F256      DQ    04380000043800000h ; 256 | 256
-CONST   ENDS
+_F256_F256      DD    043800000h, 43800000h ; 256 | 256
 
-_DATA   SEGMENT
+segment		DATA
         ALIGN   8
-btab            DD    8 DUP(0)
-atab            DD    8 DUP(0)
+btab            DD    0, 0, 0, 0, 0, 0, 0, 0
+atab            DD    0, 0, 0, 0, 0, 0, 0, 0
 vSize           DD    0
 strideinbytes   DD    0
 vertices        DD    0
-_DATA    ENDS
 
-_TEXT         SEGMENT PAGE PUBLIC USE32 'CODE'
-              ASSUME DS: FLAT, SS: FLAT
+segment		TEXT
 
-_pktype   = 20
-_type     = 24
-_mode     = 28
-_count    = 32
-_pointers = 36
+_pktype   equ 20
+_type     equ 24
+_mode     equ 28
+%define _count    32
+%define _pointers 36
 
-gc            TEXTEQU  <edi>             ; points to graphics context
-fifo          TEXTEQU  <ecx>             ; points to next entry in fifo
-dlp           TEXTEQU  <ebp>             ; points to dataList structure
-vertexCount   TEXTEQU  <esi>             ; Current vertex counter in the packet
-vertexPtr     TEXTEQU  <ebx>             ; Current vertex pointer (in deref mode)
-vertex        TEXTEQU  <ebx>             ; Current vertex (in non-deref mode)
-dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
+%define gc            edi             ; points to graphics context
+%define fifo          ecx             ; points to next entry in fifo
+%define dlp           ebp             ; points to dataList structure
+%define vertexCount   esi             ; Current vertex counter in the packet
+%define vertexPtr     ebx             ; Current vertex pointer (in deref mode)
+%define vertex        ebx             ; Current vertex (in non-deref mode)
+%define dlpStart      edx             ; Pointer to start of offset list
 
-X TEXTEQU     <0>
-Y TEXTEQU     <4>
+%define X 0
+%define Y 4
 
                   ALIGN  32
 
-    PUBLIC  __grDrawVertexList_3DNow_Window@20
-__grDrawVertexList_3DNow_Window@20 PROC NEAR
+proc _grDrawVertexList_3DNow_Window, 20
 ; 132  : {
 
     push      edi                        ; save caller's register variable
-    mov       gc, [__GlideRoot + curGC]  ; get current graphics context
+    mov       gc, [_GlideRoot + curGC]  ; get current graphics context
 
     push      esi                        ; save caller's register variable
     mov       vertexCount, [esp+_count-8]; number of vertices in strip/fan
@@ -86,7 +79,7 @@ __grDrawVertexList_3DNow_Window@20 PROC NEAR
                                          ; get current vertex (non-deref mode)
     test      vertexCount, vertexCount   ; number of vertices <= 0 ?
     
-    jle       strip_done                 ; yup, the strip/fan is done
+    jle       .strip_done                 ; yup, the strip/fan is done
   
 ;;;     vSize = gc->state.vData.vSize
 ;;;     if (stride == 0)
@@ -110,7 +103,7 @@ __grDrawVertexList_3DNow_Window@20 PROC NEAR
     test      edx, edx                   ; mode 0 (array of vertices) ?
     mov       edx, [gc + vertexStride]   ; get stride in DWORDs
     
-    jnz       deref_mode                 ; nope, it's mode 1 (array of pointers to vertices)
+    jnz       .deref_mode                 ; nope, it's mode 1 (array of pointers to vertices)
 
     femms                                ; we'll use MMX; clear MMX/3DX state      
 
@@ -129,7 +122,7 @@ __grDrawVertexList_3DNow_Window@20 PROC NEAR
 ;;;         TRI_STRIP_BEGIN(type, vcount, vSize, pktype);
 
 
-win_coords_loop_ND:
+.win_coords_loop_ND:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -145,18 +138,18 @@ win_coords_loop_ND:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       win_strip_begin_ND         ; yup, start writing strip data
+    jge       .win_strip_begin_ND         ; yup, start writing strip data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                      ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     lea       esp, [esp+12]              ; remove 3 DWORD arguments from stack
 
 
-win_strip_begin_ND:
+.win_strip_begin_ND:
 
 ;;;     Setup packet header
 ;;;
@@ -179,7 +172,7 @@ win_strip_begin_ND:
     lea       dlpStart, [gc+tsuDataList] ; pointer to start of offset list
 
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
-    jz        fifo_aligned_ND            ; yup
+    jz        .fifo_aligned_ND            ; yup
 
     mov       [fifo], eax                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
@@ -197,7 +190,7 @@ win_strip_begin_ND:
 ;;;       TRI_SETF(FARRAY(vPtr, 4));
 ;;;       i = gc->tsuDataList[dataElem];
 
-win_vertex_loop_ND_WB0:                  ; nothing in "write buffer"
+.win_vertex_loop_ND_WB0:                  ; nothing in "write buffer"
 
     mov       eax, [dlpStart]            ; get first offset from offset list
     lea       dlp, [dlpStart+4]          ; point to start of offset list
@@ -209,7 +202,7 @@ win_vertex_loop_ND_WB0:                  ; nothing in "write buffer"
     test      eax, eax                   ; if offset == 0, end of list
 
     movq      [fifo-8], mm1              ; PCI write x, y
-    jz        win_datalist_end_ND_WB0    ; no more vertex data, nothing in "write buffer" 
+    jz        .win_datalist_end_ND_WB0    ; no more vertex data, nothing in "write buffer"
 
 ;;;       while (i != GR_DLIST_END) {
 ;;;         TRI_SETF(FARRAY(vPtr, i));
@@ -217,13 +210,13 @@ win_vertex_loop_ND_WB0:                  ; nothing in "write buffer"
 ;;;         i = gc->tsuDataList[dataElem];
 ;;;       }
 
-win_datalist_loop_ND_WB0:                ; nothing in "write buffer"
+.win_datalist_loop_ND_WB0:                ; nothing in "write buffer"
 
     movd      mm1, [vertex + eax]        ; get next parameter
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; at end of offset list (offset == 0) ?
-    jz        win_datalist_end_ND_WB1    ; exit, write buffer contains one DWORD
+    jz        .win_datalist_end_ND_WB1    ; exit, write buffer contains one DWORD
 
     movd      mm2, [vertex + eax]        ; get next parameter
     add       dlp, 8                     ; dlp++
@@ -235,17 +228,17 @@ win_datalist_loop_ND_WB0:                ; nothing in "write buffer"
     punpckldq mm1, mm2                   ; current param | previous param
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jnz       win_datalist_loop_ND_WB0   ; nope, copy next parameter
+    jnz       .win_datalist_loop_ND_WB0   ; nope, copy next parameter
 
-win_datalist_end_ND_WB0:
+.win_datalist_end_ND_WB0:
 
     mov       eax, [strideinbytes]       ; get offset to next vertex
     sub       vertexCount, 1             ; another vertex done. Any left?
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       win_vertex_loop_ND_WB0     ; yup, output next vertex
+    jnz       .win_vertex_loop_ND_WB0     ; yup, output next vertex
 
-win_vertex_end_ND_WB0:
+.win_vertex_end_ND_WB0:
 
 ;;;       TRI_END;
 ;;;     Prepare for the next packet (if the strip size is longer than 15)
@@ -273,7 +266,7 @@ win_vertex_end_ND_WB0:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        win_coords_loop_ND         ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_ND         ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -286,11 +279,11 @@ win_vertex_end_ND_WB0:
     ret       20                         ; return, pop 5 DWORD parameters off stack
 
 
-fifo_aligned_ND:
+.fifo_aligned_ND:
 
     movd      mm1, eax                   ; move header into "write buffer"
 
-win_vertex_loop_ND_WB1:                  ; one DWORD in "write buffer"
+.win_vertex_loop_ND_WB1:                  ; one DWORD in "write buffer"
 
     movd      mm2, [vertex + X]          ; 0 | x of vertex
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxU32)
@@ -305,7 +298,7 @@ win_vertex_loop_ND_WB1:                  ; one DWORD in "write buffer"
     movd      mm1, [vertex + Y]          ; 0 | y of vertex
 
     cmp       eax, 0                     ; offset == 0 (list empty) ?
-    jz        win_datalist_end_ND_WB1    ; yup, no more vertex data, one DWORD in "write buffer"
+    jz        .win_datalist_end_ND_WB1    ; yup, no more vertex data, one DWORD in "write buffer"
 
 ;;;       while (i != GR_DLIST_END) {
 ;;;         TRI_SETF(FARRAY(vPtr, i));
@@ -313,7 +306,7 @@ win_vertex_loop_ND_WB1:                  ; one DWORD in "write buffer"
 ;;;         i = gc->tsuDataList[dataElem];
 ;;;       }
 
-win_datalist_loop_ND_WB1:                ; one DWORD in "write buffer" 
+.win_datalist_loop_ND_WB1:                ; one DWORD in "write buffer"
 
     movd      mm2, [vertex + eax]        ; get next parameter
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxU32)
@@ -325,23 +318,23 @@ win_datalist_loop_ND_WB1:                ; one DWORD in "write buffer"
     cmp       eax, 0                     ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jz        win_datalist_end_ND_WB0    ; yes, exit, "write buffer" empty
+    jz        .win_datalist_end_ND_WB0    ; yes, exit, "write buffer" empty
 
     movd      mm1, [vertex+eax]          ; get next parameter
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; at end of offset list (offset == 0) ?
-    jnz       win_datalist_loop_ND_WB1   ; nope, copy next parameter
+    jnz       .win_datalist_loop_ND_WB1   ; nope, copy next parameter
 
-win_datalist_end_ND_WB1:
+.win_datalist_end_ND_WB1:
 
     mov       eax, [strideinbytes]       ; get offset to next vertex
     sub       vertexCount, 1             ; another vertex done. Any left?
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       win_vertex_loop_ND_WB1     ; yup, output next vertex
+    jnz       .win_vertex_loop_ND_WB1     ; yup, output next vertex
 
-win_vertex_end_ND_WB1:
+.win_vertex_end_ND_WB1:
 
     movd      [fifo], mm1                ; flush "write buffer"
     add       fifo, 4                    ; fifoPtr += sizeof(FxU32)
@@ -372,7 +365,7 @@ win_vertex_end_ND_WB1:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        win_coords_loop_ND         ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_ND         ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -386,13 +379,13 @@ win_vertex_end_ND_WB1:
 
     db        08dh,064h,024h,000h        ; filler (not reachable)
     
-deref_mode:
+.deref_mode:
 
     femms                                ; we'll use MMX; clear FPU/MMX state
 
     prefetch  [vertexPtr]                ; pre-load first group of pointers
 
-win_coords_loop_D:
+.win_coords_loop_D:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -408,18 +401,18 @@ win_coords_loop_D:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       win_strip_begin_D          ; yup, start writing strip data
+    jge       .win_strip_begin_D          ; yup, start writing strip data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
     nop                                  ; filler
 
-win_strip_begin_D:
+.win_strip_begin_D:
 
 ;;;     Setup packet header
 ;;;
@@ -442,7 +435,7 @@ win_strip_begin_D:
     lea       dlpStart, [gc+tsuDataList] ; pointer to start of offset list
 
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
-    jz        fifo_aligned_D             ; yup
+    jz        .fifo_aligned_D             ; yup
 
     mov       [fifo], eax                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
@@ -461,7 +454,7 @@ win_strip_begin_D:
 ;;;       i = gc->tsuDataList[dataElem];
 
 
-win_vertex_loop_D_WB0:                   ; nothing in "write buffer"
+.win_vertex_loop_D_WB0:                   ; nothing in "write buffer"
 
     mov       edx, [vertexPtr]           ; dereference pointer, edx points to vertex
     add       vertexPtr, 4               ; next pointer
@@ -476,7 +469,7 @@ win_vertex_loop_D_WB0:                   ; nothing in "write buffer"
     movq      [fifo-8], mm1              ; PCI write x, y
 
     cmp       eax, 0                     ; if offset == 0, end of offset list
-    je        win_datalist_end_D_WB0     ; no more vertex data, nothing in "write buffer" 
+    je        .win_datalist_end_D_WB0     ; no more vertex data, nothing in "write buffer"
 
 ;;;       while (i != GR_DLIST_END) {
 ;;;         TRI_SETF(FARRAY(vPtr, i));
@@ -484,13 +477,13 @@ win_vertex_loop_D_WB0:                   ; nothing in "write buffer"
 ;;;         i = gc->tsuDataList[dataElem];
 ;;;       }
 
-win_datalist_loop_D_WB0:                 ; nothing in "write buffer"
+.win_datalist_loop_D_WB0:                 ; nothing in "write buffer"
 
     movd      mm1, [edx + eax]           ; get next parameter
     mov       eax, [dlp]                 ; get next offset from offset list
 
     cmp       eax, 0                     ; at end of offset list (offset == 0) ?
-    jz        win_datalist_end_D_WB1     ; exit, write buffer contains one DWORD
+    jz        .win_datalist_end_D_WB1     ; exit, write buffer contains one DWORD
 
     add       dlp, 8                     ; dlp++
     movd      mm2, [edx + eax]           ; get next parameter
@@ -502,14 +495,14 @@ win_datalist_loop_D_WB0:                 ; nothing in "write buffer"
     test      eax, eax                   ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jnz       win_datalist_loop_D_WB0    ; nope, copy next parameter
+    jnz       .win_datalist_loop_D_WB0    ; nope, copy next parameter
 
-win_datalist_end_D_WB0:
+.win_datalist_end_D_WB0:
 
     dec       vertexCount                ; another vertex done. Any left?
-    jnz       win_vertex_loop_D_WB0      ; yup, output next vertex
+    jnz       .win_vertex_loop_D_WB0      ; yup, output next vertex
 
-win_vertex_end_D_WB0:
+.win_vertex_end_D_WB0:
 
 ;;;       TRI_END;
 ;;;     Prepare for the next packet (if the strip size is longer than 15)
@@ -537,7 +530,7 @@ win_vertex_end_D_WB0:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        win_coords_loop_D          ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_D          ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -550,11 +543,11 @@ win_vertex_end_D_WB0:
     ret       20                         ; return, pop 5 DWORD parameters off stack
     mov       esp, esp                   ; filler (unreachable)
 
-fifo_aligned_D:
+.fifo_aligned_D:
 
     movd      mm1, eax                   ; move header into "write buffer"
 
-win_vertex_loop_D_WB1:                   ; one DWORD in "write buffer"
+.win_vertex_loop_D_WB1:                   ; one DWORD in "write buffer"
 
     mov       edx, [vertexPtr]           ; dereference pointer, edx points to vertex
     add       vertexPtr, 4               ; next pointer
@@ -572,7 +565,7 @@ win_vertex_loop_D_WB1:                   ; one DWORD in "write buffer"
     movd      mm1, [edx + Y]             ; 0 | y of vertex
 
     cmp       eax, 0                     ; offset == 0 (list empty) ?
-    je        win_datalist_end_D_WB1     ; yup, no more vertex data, one DWORD in "write buffer"
+    je        .win_datalist_end_D_WB1     ; yup, no more vertex data, one DWORD in "write buffer"
 
 ;;;       while (i != GR_DLIST_END) {
 ;;;         TRI_SETF(FARRAY(vPtr, i));
@@ -580,7 +573,7 @@ win_vertex_loop_D_WB1:                   ; one DWORD in "write buffer"
 ;;;         i = gc->tsuDataList[dataElem];
 ;;;       }
 
-win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
+.win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
 
     movd      mm2, [edx + eax]          ; get next parameter
     add       fifo, 8                   ; fifoPtr += 2*sizeof(FxU32)
@@ -592,20 +585,20 @@ win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
     test      eax, eax                  ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1             ; PCI write current param | previous param
-    jz        win_datalist_end_D_WB0    ; yes, exit, "write buffer" empty
+    jz        .win_datalist_end_D_WB0    ; yes, exit, "write buffer" empty
 
     movd      mm1, [edx + eax]          ; get next parameter
     mov       eax, [dlp-4]              ; get next offset from offset list
 
     test      eax, eax                  ; at end of offset list (offset == 0) ?
-    jnz       win_datalist_loop_D_WB1   ; nope, copy next parameter
+    jnz       .win_datalist_loop_D_WB1   ; nope, copy next parameter
 
-win_datalist_end_D_WB1:
+.win_datalist_end_D_WB1:
 
     dec       vertexCount               ; another vertex done. Any left?
-    jnz       win_vertex_loop_D_WB1     ; yup, output next vertex
+    jnz       .win_vertex_loop_D_WB1     ; yup, output next vertex
 
-win_vertex_end_D_WB1:
+.win_vertex_end_D_WB1:
 
     movd      [fifo], mm1               ; flush "write buffer"
     add       fifo, 4                   ; fifoPtr++
@@ -636,11 +629,11 @@ win_vertex_end_D_WB1:
     cmp       vertexCount, 0             ; any vertices left to process ?
 
     mov       [esp + _count], vertexCount; remaining number of vertices to process 
-    jg        win_coords_loop_D          ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_D          ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
-strip_done: 
+.strip_done:
     pop       ebp                        ; restore frame pointer
     pop       ebx                        ; restore caller's register variable
 
@@ -649,14 +642,13 @@ strip_done:
 
     ret       20                         ; return, pop 5 DWORD parameters off stack
 
-__grDrawVertexList_3DNow_Window@20 ENDP
+endp
 
-    PUBLIC  __grDrawVertexList_3DNow_Clip@20
-__grDrawVertexList_3DNow_Clip@20 PROC NEAR
+proc _grDrawVertexList_3DNow_Clip, 20
 ; 132  : {
 
     push      edi                        ; save caller's register variable
-    mov       gc, [__GlideRoot + curGC]  ; get current graphics context
+    mov       gc, [_GlideRoot + curGC]  ; get current graphics context
 
     push      esi                        ; save caller's register variable
     mov       vertexCount, [esp+_count-8]; number of vertices in strip/fan
@@ -668,7 +660,7 @@ __grDrawVertexList_3DNow_Clip@20 PROC NEAR
                                          ; get current vertex (non-deref mode)
     test      vertexCount, vertexCount   ; number of vertices <= 0 ?
     
-    jle       strip_done                 ; yup, the strip/fan is done
+    jle       .strip_done                 ; yup, the strip/fan is done
   
 ;;;     vSize = gc->state.vData.vSize
 ;;;     if (stride == 0)
@@ -692,20 +684,20 @@ __grDrawVertexList_3DNow_Clip@20 PROC NEAR
     test      edx, edx                   ; mode 0 (array of vertices) ?
     mov       edx, [gc + vertexStride]   ; get stride in DWORDs
 
-    movd      mm6, [__GlideRoot+pool_f255]; GlideRoot.pool.f255     
-    mov       [strideinbytes], 4         ; array of pointers    
+    movd      mm6, [_GlideRoot+pool_f255]; GlideRoot.pool.f255
+    mov       dword [strideinbytes], 4         ; array of pointers
         
-    jnz       clip_coords_begin          ; nope, it's mode 1
+    jnz       .clip_coords_begin          ; nope, it's mode 1
 
-clip_coordinates_ND:
+.clip_coordinates_ND:
 
     shl       edx, 2                     ; stride in bytes
     mov       [strideinbytes], edx       ; save off stride (in bytes)
 
     align   32
-clip_coords_begin:
+.clip_coords_begin:
 
-dataElem      textequ <ebp>              ; number of vertex components processed    
+%define dataElem      ebp              ; number of vertex components processed
 
 ;;;   {
 ;;;     float oow;
@@ -727,17 +719,17 @@ dataElem      textequ <ebp>              ; number of vertex components processed
     nop                                  ; filler
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       clip_strip_begin           ; yup, start writing strip data
+    jge       .clip_strip_begin           ; yup, start writing strip data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
 
-clip_strip_begin:
+.clip_strip_begin:
 
 ;;;     TRI_STRIP_BEGIN(type, vcount, vSize, pktype)
 
@@ -763,7 +755,7 @@ clip_strip_begin:
 ;;;       float *vPtr
 ;;;       vPtr = pointers
   
-clip_for_begin:
+.clip_for_begin:
 
 ;;;       if (mode)
 ;;;         vPtr = *(float **)vPtr
@@ -775,12 +767,12 @@ clip_for_begin:
     test      eax, eax                   ; deref mode ?
 
     mov       eax, [gc+wInfo_offset]     ; get offset of W into vertex struct
-    jz        clip_noderef               ; yup, no-deref mode
+    jz        .clip_noderef               ; yup, no-deref mode
 
     mov       edx, [vertexPtr]           ; vertex = *vertexPtr
     lea       esp, [esp]                 ; filler
 
-clip_noderef:
+.clip_noderef:
 
 ;;;       oow = 1.0f / FARRAY(vPtr, gc->state.vData.wInfo.offset)
 
@@ -821,13 +813,13 @@ clip_noderef:
 ;;;       TRI_VP_SETFS(vPtr, oow);
 
     movq      [fifo-8], mm2              ; PCI write transformed x, y
-    jz        clip_setup_ooz             ; nope, no color at all needed
+    jz        .clip_setup_ooz             ; nope, no color at all needed
   
-    cmp       DWORD PTR [gc+colorType], 0; gc->state.vData.colorType == GR_FLOAT ?
-    jne       clip_setup_pargb           ; nope, packed ARGB format
+    cmp       dword [gc+colorType], 0; gc->state.vData.colorType == GR_FLOAT ?
+    jne       .clip_setup_pargb           ; nope, packed ARGB format
   
     test      esi, 1                     ; STATE_REQUIRES_IT_DRGB ?
-    jz        clip_setup_a               ; no, but definitely A
+    jz        .clip_setup_a               ; no, but definitely A
 
     movd      mm2, [edx + eax]           ; 0 | r
     mov       eax, [gc+tsuDataList+4]    ; offset of g part of vertex data
@@ -851,9 +843,9 @@ clip_noderef:
     lea       fifo, [fifo+12]            ; fifoPtr += 3*sizeof(FxFloat)
 
     movd      [fifo-4], mm2              ; PCI write b*255
-    jz        clip_setup_ooz             ; nope, no alpha, proceeed with ooz
+    jz        .clip_setup_ooz             ; nope, no alpha, proceeed with ooz
 
-clip_setup_a:
+.clip_setup_a:
     movd      mm2, [eax+edx]             ; 0 | a
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
 
@@ -864,11 +856,11 @@ clip_setup_a:
     mov       eax, [gc+dataElem+tsuDataList]; offset of next part of vertex data
 
     movd      [fifo-4], mm2              ; PCI write a*255
-    jmp       clip_setup_ooz             ; check whether we need to push out z
+    jmp       .clip_setup_ooz             ; check whether we need to push out z
 
     ALIGN     32
 
-clip_setup_pargb:
+.clip_setup_pargb:
     movd      mm2, [eax+edx]             ; get packed ARGB data
     add       fifo, 4                    ; fifoPtr += sizeof(FxU32)
 
@@ -877,10 +869,10 @@ clip_setup_pargb:
 
     movd      [fifo-4], mm2              ; PCI write packed ARGB
 
-clip_setup_ooz:
+.clip_setup_ooz:
   
     test      esi, 4                     ; STATE_REQUIRES_OOZ ?
-    jz        clip_setup_qow             ; nope
+    jz        .clip_setup_qow             ; nope
 
     movd      mm2, [eax+edx]             ; 0 | z component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -897,12 +889,12 @@ clip_setup_ooz:
     pfadd     mm2, mm4                   ; 0 | TRI_SETF(FARRAY(_s, i)*_oow*gc->state.Viewport.hdepth+gc->state.Viewport.oz
     movd      [fifo-4], mm2              ; PCI write transformed Z
 
-clip_setup_qow:
+.clip_setup_qow:
     test      esi, 8                     ; STATE_REQUIRES_OOW_FBI ?
-    jz        clip_setup_qow0            ; nope
+    jz        .clip_setup_qow0            ; nope
 
-    cmp       DWORD PTR [gc+qInfo_mode],0; does vertex have Q component ?
-    je        clip_setup_oow             ; nope, not Q but W
+    cmp       dword [gc+qInfo_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow             ; nope, not Q but W
 
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
     mov       eax, [gc+qInfo_offset]     ; offset of Q component of vertex
@@ -914,23 +906,23 @@ clip_setup_qow:
     pfmul     mm2, mm0                   ; q*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed Q
-    jmp       clip_setup_qow0            ; continue with q0
+    jmp       .clip_setup_qow0            ; continue with q0
 
     ALIGN     32
 
-clip_setup_oow:
+.clip_setup_oow:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-clip_setup_qow0:
+.clip_setup_qow0:
     test      esi, 16                    ; STATE_REQUIRES_W_TMU0 ?
-    jz        clip_setup_stow0           ; nope 
+    jz        .clip_setup_stow0           ; nope
 
-    cmp       DWORD PTR [gc+q0Info_mode],0; does vertex have Q component ?
-    je        clip_setup_oow0            ; nope, not Q but W
+    cmp       dword [gc+q0Info_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow0            ; nope, not Q but W
 
     mov       eax, [gc+q0Info_offset]    ; offset of Q component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -942,21 +934,21 @@ clip_setup_qow0:
     pfmul     mm2, mm0                   ; q0*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed q0
-    jmp       clip_setup_stow0           ; continue with stow0
+    jmp       .clip_setup_stow0           ; continue with stow0
 
     ALIGN     32
 
-clip_setup_oow0:
+.clip_setup_oow0:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-clip_setup_stow0:
+.clip_setup_stow0:
 
     test      esi, 32                    ; STATE_REQUIRES_ST_TMU0 ?
-    jz        clip_setup_qow1            ; nope
+    jz        .clip_setup_qow1            ; nope
 
     movq      mm7, [gc + tmu0_s_scale]   ; state.tmu_config[0].t_scale | state.tmu_config[0].s_scale
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
@@ -975,12 +967,12 @@ clip_setup_stow0:
     movq      [fifo-8], mm2              ; PCI write param2*oow*tmu0_t_scale | param1*oow*tmu0_s_scale 
     mov       eax, [gc+dataElem+tsuDataList]; pointer to next vertex component
 
-clip_setup_qow1:
+.clip_setup_qow1:
     test      esi, 64                    ; STATE_REQUIRES_W_TMU1 ?
-    jz        clip_setup_stow1           ; nope
+    jz        .clip_setup_stow1           ; nope
 
-    cmp       DWORD PTR [gc+q1Info_mode],0; does vertex have Q component ?
-    je        clip_setup_oow1            ; nope, not Q but W
+    cmp       dword [gc+q1Info_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow1            ; nope, not Q but W
 
     mov       eax, [gc+q1Info_offset]    ; offset of Q component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -992,24 +984,24 @@ clip_setup_qow1:
     pfmul     mm2, mm0                   ; q1*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed q1
-    jmp       clip_setup_stow1           ; continue with stow1
+    jmp       .clip_setup_stow1           ; continue with stow1
 
     ALIGN     32
 
-clip_setup_oow1:
+.clip_setup_oow1:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-clip_setup_stow1:
+.clip_setup_stow1:
 
     test      esi, 128                   ; STATE_REQUIRES_ST_TMU1 ?
     mov       vertexCount, [vertices]    ; get number of vertices
 
     movq      mm7, [gc + tmu1_s_scale]   ; state.tmu_config[1].t_scale | state.tmu_config[1].s_scale
-    jz        clip_setup_end             ; nope
+    jz        .clip_setup_end             ; nope
 
     movd      mm2, [edx+eax]             ; param1
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
@@ -1023,13 +1015,13 @@ clip_setup_stow1:
     pfmul     mm2, mm7                   ; param2*oow*state.tmu_config[1].t_scale | param1*oow*state.tmu_config[1].s_scale
     movq      [fifo-8], mm2              ; PCI write param2*oow*state.tmu_config[1].t_scale | param1*oow*state.tmu_config[1].s_scale
 
-clip_setup_end:
+.clip_setup_end:
 
 ; 206  :       for (k = 0; k < vcount; k++) {
 
     dec       vertexCount                ; vcount--
-    jnz       clip_for_begin             ; until 
-clip_for_end:
+    jnz       .clip_for_begin             ; until
+.clip_for_end:
 
 ; 221  :       }
 ; 222  :       TRI_END;
@@ -1052,12 +1044,12 @@ clip_for_end:
     mov       [esp + _count], vertexCount; remaining number of vertices to process 
     cmp       vertexCount, 0             ; any vertices left to process ?
 
-    mov       DWORD PTR [esp+_pktype], 16; pktype = SSTCP_PKT3_DDDDDD (strip continuation)
-    jg        clip_coords_begin          ; loop if number of vertices to process >= 0
+    mov       dword [esp+_pktype], 16; pktype = SSTCP_PKT3_DDDDDD (strip continuation)
+    jg        .clip_coords_begin          ; loop if number of vertices to process >= 0
 
     femms                                ; no more MMX code; clear MMX/FPU state    
 
-strip_done:
+.strip_done:
 ;;;    }
 ;;;  #undef FN_NAME
 ;;;  } /* _grDrawVertexList */
@@ -1070,26 +1062,25 @@ strip_done:
 
     ret       20                         ; return, pop 5 DWORD parameters off stack
 
-__grDrawVertexList_3DNow_Clip@20 ENDP    
+endp
 
     ALIGN     32
 
-_a$      TEXTEQU <20>
-_b$      TEXTEQU <24>
+%define _a$      20                     ; esp offset of arg a after all four register pushes
+%define _b$      24                     ; arg b on the same basis; read as [_b$+esp-8] after two pushes
 
-gc       TEXTEQU <esi>
-vb       TEXTEQU <edi>
-va       TEXTEQU <ebx>
-i        TEXTEQU <ebp>
-j        TEXTEQU <edx>
-dlp      TEXTEQU <edx>
-fifo     TEXTEQU <ecx>
-dlpStart TEXTEQU <ebp>
-ADX      TEXTEQU <__GlideRoot+pool_fTemp1>
-ADY      TEXTEQU <__GlideRoot+pool_fTemp2>
+%define gc       esi                    ; current graphics context (from _GlideRoot+curGC)
+%define vb       edi                    ; base address of vertex b
+%define va       ebx                    ; base address of vertex a
+%define i        ebp                    ; ADY read as integer bits; shares ebp with dlpStart
+%define j        edx                    ; compared with i to choose the code path; shares edx with dlp
+%define dlp      edx                    ; cursor into the vertex-component offset list
+%define fifo     ecx                    ; command fifo write pointer (from gc+fifoPtr)
+%define dlpStart ebp                    ; base from which dlp is re-seeded (lea dlp, [dlpStart+4])
+%define ADX      _GlideRoot+pool_fTemp1 ; pool_fTemp1 slot inside _GlideRoot
+%define ADY      _GlideRoot+pool_fTemp2 ; pool_fTemp2 slot inside _GlideRoot
 
-         PUBLIC  __grDrawTextureLine_3DNow@8
-__grDrawTextureLine_3DNow@8 PROC NEAR
+proc _grDrawTextureLine_3DNow, 8
 
 ; 227  : {
 ; 228  : #define FN_NAME "grDrawTextureLine"
@@ -1102,16 +1093,16 @@ __grDrawTextureLine_3DNow@8 PROC NEAR
 ; 235  :   GR_FLUSH_STATE();
 
     push      esi                        ; save caller's register variable
-    mov       gc, [__GlideRoot + curGC]  ; get current graphics context
+    mov       gc, [_GlideRoot + curGC]  ; get current graphics context
 
     push      edi                        ; save caller's register variable
-    mov       vb, _b$[esp-8]             ; b
+    mov       vb, [_b$ + esp-8]             ; b
 
     push      ebx                        ; save caller's register variable
     push      ebp                        ; save caller's frame pointer
 
     femms                                ; we'll use MMX; empty FPU/MMX state    
-    mov       va, _a$[esp]               ; a
+    mov       va, [_a$ + esp]               ; a
 
 ; 236  : 
 ; 237  :   {
@@ -1150,7 +1141,7 @@ __grDrawTextureLine_3DNow@8 PROC NEAR
     mov       i, [ADY]                   ; i = *(long *)&ADY
 
     test      i, i                       ; i < 0 ?
-    jge       $dont_swap_ij              ; nope, no need to swap i and j 
+    jge       .dont_swap_ij              ; nope, no need to swap i and j
 
     xor       va, vb                     ; va ^ vb
     xor       vb, va                     ; vb ^ (va ^ vb) = va
@@ -1158,7 +1149,7 @@ __grDrawTextureLine_3DNow@8 PROC NEAR
     xor       va, vb                     ; (va ^ vb) ^ va = vb
     xor       i, 80000000h               ; i ^= 0x80000000
 
-$dont_swap_ij:
+.dont_swap_ij:
 
 ; 253  :     
 ; 254  :     DX = FARRAY(b, 0) - FARRAY(a, 0);
@@ -1176,32 +1167,32 @@ $dont_swap_ij:
     and       j, 7fffffffh               ; j = abs(j)
 
     cmp       j, i                       ; j < i ?
-    jl        $j_lt_i                    ; yup
+    jl        .j_lt_i                    ; yup
 
     test      j, j                       ; j == 0 ?
-    jz        $line_all_done             ; yup, nothing to draw
+    jz        .line_all_done             ; yup, nothing to draw
 
-$j_lt_i:
+.j_lt_i:
 
 ; 264  :     vSize = gc->state.vData.vSize + 8;
 ; 265  :     GR_SET_EXPECTED_SIZE((vSize<< 2), 1);
 
-    lea       eax, DWORD PTR [eax*4+36]  ; we have vertices + 4 bytes for header
+    lea       eax, [eax*4+36]  ; we have vertices + 4 bytes for header
     cmp       ecx, eax                   ; fifo room avail >= fifo room required ?
 
-    jge       $enough_fifo_room          ; yup, sufficient fifo room
+    jge       .enough_fifo_room          ; yup, sufficient fifo room
     push      j                          ; preserve j
 
-    push      @Line                      ; line number in source file
+    push      __LINE__                   ; line number in source file
     push      0                          ; pointer to filename = NULL
 
     push      eax                        ; fifo space required
-    call      __FifoMakeRoom             ; allocate new fifo space (modified fifoPtr)
+    call      _FifoMakeRoom             ; allocate new fifo space (modified fifoPtr)
 
     add       esp, 12                    ; pop 3 DWORD parameters off stack
     pop       j                          ; restore j
 
-$enough_fifo_room:
+.enough_fifo_room:
 
 ; 266  :     TRI_STRIP_BEGIN(kSetupCullDisable | kSetupStrip, 4, vSize, 
 ; 267  :                     SSTCP_PKT3_BDDDDD | (1<<15));
@@ -1211,11 +1202,11 @@ $enough_fifo_room:
     mov       fifo, [gc+fifoPtr]         ; gc->cmdTransportInfo.fifoPtr
     pxor      mm3, mm3                   ; 0 | 0
 
-    movd      mm2, [__GlideRoot+pool_fHalf] ; 0 | _GlideRoot.pool.fHalf
+    movd      mm2, [_GlideRoot+pool_fHalf] ; 0 | _GlideRoot.pool.fHalf
     movq      mm4, [_F256_F256]          ; 256.0f | 256.0f
 
     test      fifo, 4                    ; fifo QWORD aligned ?
-    jz        $drawline_fifo_aligned     ; yup
+    jz        .drawline_fifo_aligned     ; yup
 
     add       fifo, 4                    ; fifoPtr += sizeof(FxU32)
     mov       eax, [gc+cullStripHdr]     ; gc->cmdTransportInfo.cullStripHdr
@@ -1227,7 +1218,7 @@ $enough_fifo_room:
     cmp       j, i                       ; j < i ?
 
     mov       [fifo-4], eax              ; PCI write header (fifo now aligned)
-    jl        $j_lt_i2_WB0               ; yup, j < i
+    jl        .j_lt_i2_WB0               ; yup, j < i
 
 ; 270  :       TRI_SETF(FARRAY(b, 0));
 ; 271  :       dataElem = 0;
@@ -1252,14 +1243,14 @@ $enough_fifo_room:
     movq      [fifo-8], mm6              ; PCI write FARRAY(b,4)-_GlideRoot.pool.fHalf | FARRAY(b,0)
 
     test      eax, eax                   ; i != GR_DLIST_END ?
-    je        $vertex_loop1_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    je        .vertex_loop1_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop1_WB0:
+.vertex_loop1_WB0:
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop1_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop1_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1271,9 +1262,9 @@ $vertex_loop1_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop1_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop1_WB0          ; offset != 0, process next component
 
-$vertex_loop1_done_WB0:
+.vertex_loop1_done_WB0:
 
 ; 279  :       TRI_SETF(0.f);
 ; 280  :       TRI_SETF(0.f);
@@ -1301,14 +1292,14 @@ $vertex_loop1_done_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(a,4)-_GlideRoot.pool.fHalf | FARRAY(a,0)
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    je        $vertex_loop2_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    je        .vertex_loop2_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop2_WB0:
+.vertex_loop2_WB0:
     movd      mm7, [va + eax]            ; FARRAY(a,i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop2_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop2_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [va + eax]            ; FARRAY(a,i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1320,9 +1311,9 @@ $vertex_loop2_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop2_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop2_WB0          ; offset != 0, process next component
 
-$vertex_loop2_done_WB0:
+.vertex_loop2_done_WB0:
 
 ; 291  :       TRI_SETF(0.f);
 ; 292  :       TRI_SETF(0.f);
@@ -1350,14 +1341,14 @@ $vertex_loop2_done_WB0:
     test      eax, eax                   ; i != GR_DLIST_END ?
 
     movq      [fifo-8], mm6              ; PCI write FARRAY(b,4)+_GlideRoot.pool.fHalf | FARRAY(b,0)
-    je        $vertex_loop3_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    je        .vertex_loop3_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop3_WB0:
+.vertex_loop3_WB0:
     movd      mm7, [vb + eax]            ; FARRAY(b, i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop3_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop3_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [vb + eax]            ; FARRAY(b, i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1369,9 +1360,9 @@ $vertex_loop3_WB0:
     cmp       eax, 0                     ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jne       $vertex_loop3_WB0          ; offset != 0, process next component
+    jne       .vertex_loop3_WB0          ; offset != 0, process next component
 
-$vertex_loop3_done_WB0:
+.vertex_loop3_done_WB0:
 
 ; 303  :       TRI_SETF(256.f);
 ; 304  :       TRI_SETF(0.f);
@@ -1401,14 +1392,14 @@ $vertex_loop3_done_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(a,4)+_GlideRoot.pool.fHalf | FARRAY(a,0)
 
     test      eax, eax                   ; i != GR_DLIST_END ?
-    jz        $vertex_loop4_done_WB0     ; i == GR_DLIST_END, done with all four vertices
+    jz        .vertex_loop4_done_WB0     ; i == GR_DLIST_END, done with all four vertices
 
-$vertex_loop4_WB0:
+.vertex_loop4_WB0:
     movd      mm7, [va + eax]            ; FARRAY(a, i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop4_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop4_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [va + eax]            ; FARRAY(a, i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1420,16 +1411,16 @@ $vertex_loop4_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop4_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop4_WB0          ; offset != 0, process next component
 
-$vertex_loop4_done_WB0:
+.vertex_loop4_done_WB0:
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
     lea       esp, [esp]                 ; filler
 
     movq      [fifo-8], mm4              ; PCI write TRI_SETF(0.f) | TRI_SETF(256.f) 
-    jmp       $line_done_WB0             ; last vertex done
+    jmp       .line_done_WB0             ; last vertex done
 
-$j_lt_i2_WB0:
+.j_lt_i2_WB0:
 
 ; 317  :     } else { /* y major */
 ; 318  :       TRI_SETF(FARRAY(b, 0) - _GlideRoot.pool.fHalf);
@@ -1455,14 +1446,14 @@ $j_lt_i2_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(b,4)-_GlideRoot.pool.fHalf | FARRAY(b,0)
 
     cmp       eax, 0                     ; i != GR_DLIST_END ?
-    je        $vertex_loop5_done_WB0     ; i == GR_DLIST_END, no more components in vertex
+    je        .vertex_loop5_done_WB0     ; i == GR_DLIST_END, no more components in vertex
 
-$vertex_loop5_WB0:
+.vertex_loop5_WB0:
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop5_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop5_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1474,9 +1465,9 @@ $vertex_loop5_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop5_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop5_WB0          ; offset != 0, process next component
 
-$vertex_loop5_done_WB0:
+.vertex_loop5_done_WB0:
 
 ; 327  :       TRI_SETF(0.f);
 ; 328  :       TRI_SETF(0.f);
@@ -1504,14 +1495,14 @@ $vertex_loop5_done_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(a,4) | FARRAY(a,0)-_GlideRoot.pool.fHalf
 
     test      eax, eax                   ; i != GR_DLIST_END ?
-    jz        $vertex_loop6_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop6_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop6_WB0:
+.vertex_loop6_WB0:
     movd      mm7, [va + eax]            ; get next vertex component
     mov       eax, [dlp]                 ; get next offset from offset list
 
     cmp       eax, 0                     ; offset == 0 (end of offset list) ?
-    je        $vertex_loop6_done_WB1     ; nope, output next vertex component
+    je        .vertex_loop6_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [va + eax]            ; get next vertex component
     add       dlp, 8                     ; point to next entry in offset list
@@ -1523,9 +1514,9 @@ $vertex_loop6_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop6_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop6_WB0          ; offset != 0, process next component
 
-$vertex_loop6_done_WB0:
+.vertex_loop6_done_WB0:
 
 ; 339  :       TRI_SETF(0.f);
 ; 340  :       TRI_SETF(0.f);
@@ -1553,15 +1544,15 @@ $vertex_loop6_done_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(b,4) | FARRAY(b,0)+_GlideRoot.pool.fHalf
 
     test      eax, eax                   ; i != GR_DLIST_END ?
-    jz        $vertex_loop7_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop7_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop7_WB0:
+.vertex_loop7_WB0:
 
     movd      mm7, [vb + eax]            ; FARRAY(b, i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $vertex_loop7_done_WB1     ; nope, output next vertex component
+    jz        .vertex_loop7_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [vb + eax]            ; FARRAY(b, i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1573,9 +1564,9 @@ $vertex_loop7_WB0:
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $vertex_loop7_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop7_WB0          ; offset != 0, process next component
 
-$vertex_loop7_done_WB0:
+.vertex_loop7_done_WB0:
 
 ; 351  :       TRI_SETF(256.f);
 ; 352  :       TRI_SETF(0.f);
@@ -1605,14 +1596,14 @@ $vertex_loop7_done_WB0:
     movq      [fifo-8], mm6              ; PCI write FARRAY(a,4)+_GlideRoot.pool.fHalf | FARRAY(a,0)
 
     test      eax, eax                   ; i != GR_DLIST_END ?
-    jz        $vertex_loop8_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop8_done_WB0     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop8_WB0:
+.vertex_loop8_WB0:
     movd      mm7, [va + eax]            ; FARRAY(a, i)
     mov       eax, [dlp]                 ; get next offset from offset list
 
     cmp       eax, 0                     ; offset == 0 (end of offset list) ?
-    je        $vertex_loop8_done_WB1     ; nope, output next vertex component
+    je        .vertex_loop8_done_WB1     ; nope, output next vertex component
 
     movd      mm6, [va + eax]            ; FARRAY(a, i)
     add       dlp, 8                     ; point to next entry in offset list
@@ -1624,17 +1615,17 @@ $vertex_loop8_WB0:
     movq      [fifo-8], mm7              ; PCI write current component | previous component
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jnz       $vertex_loop8_WB0          ; offset != 0, process next component
+    jnz       .vertex_loop8_WB0          ; offset != 0, process next component
 
-$vertex_loop8_done_WB0:
+.vertex_loop8_done_WB0:
 
     movq      [fifo], mm4                ; PCI write TRI_SETF(0.f) | TRI_SETF(256.f) 
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
     nop                                  ; filler
-    jmp       $line_done_WB0             ; done with line
+    jmp       .line_done_WB0             ; done with line
 
-$drawline_fifo_aligned:
+.drawline_fifo_aligned:
 
     mov       eax, [gc+cullStripHdr]     ; gc->cmdTransportInfo.cullStripHdr
     psllq     mm2, 32                    ; _GlideRoot.pool.fHalf | 0
@@ -1645,7 +1636,7 @@ $drawline_fifo_aligned:
     movd      mm7, eax                   ; move header to "write buffer" = MM7
 
     cmp       j, i                       ; j < i ?
-    jl        $j_lt_i2_WB1               ; yup, j < i
+    jl        .j_lt_i2_WB1               ; yup, j < i
 
 ; 270  :       TRI_SETF(FARRAY(b, 0));
 ; 271  :       dataElem = 0;
@@ -1673,9 +1664,9 @@ $drawline_fifo_aligned:
     movq      mm7, mm6                   ; FARRAY(b,4)-_GlideRoot.pool.fHalf | FARRAY(b,0)
 
     punpckhdq mm7, mm7                   ; write buffer = FARRAY(b,4)-_GlideRoot.pool.fHalf 
-    jz        $vertex_loop1_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop1_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop1_WB1:
+.vertex_loop1_WB1:
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1686,15 +1677,15 @@ $vertex_loop1_WB1:
     test      eax, eax                   ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop1_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop1_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    jne       $vertex_loop1_WB1          ; nope, more components to handle
+    jne       .vertex_loop1_WB1          ; nope, more components to handle
 
-$vertex_loop1_done_WB1:
+.vertex_loop1_done_WB1:
 
 ; 279  :       TRI_SETF(0.f);
 ; 280  :       TRI_SETF(0.f);
@@ -1728,9 +1719,9 @@ $vertex_loop1_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    je        $vertex_loop2_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
+    je        .vertex_loop2_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop2_WB1:
+.vertex_loop2_WB1:
     movd      mm6, [va + eax]            ; FARRAY(a,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1741,15 +1732,15 @@ $vertex_loop2_WB1:
     test      eax, eax                   ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop2_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop2_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [va + eax]            ; FARRAY(a,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jnz       $vertex_loop2_WB1          ; nope, more components to handle
+    jnz       .vertex_loop2_WB1          ; nope, more components to handle
 
-$vertex_loop2_done_WB1:
+.vertex_loop2_done_WB1:
 
 ; 291  :       TRI_SETF(0.f);
 ; 292  :       TRI_SETF(0.f);
@@ -1783,9 +1774,9 @@ $vertex_loop2_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jz        $vertex_loop3_done_WB1     ; yup, i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop3_done_WB1     ; yup, i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop3_WB1:
+.vertex_loop3_WB1:
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1796,15 +1787,15 @@ $vertex_loop3_WB1:
     test      eax, eax                   ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop3_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop3_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    jnz       $vertex_loop3_WB1          ; nope, more components to handle
+    jnz       .vertex_loop3_WB1          ; nope, more components to handle
 
-$vertex_loop3_done_WB1:
+.vertex_loop3_done_WB1:
 
 ; 303  :       TRI_SETF(256.f);
 ; 304  :       TRI_SETF(0.f);
@@ -1840,9 +1831,9 @@ $vertex_loop3_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jz        $vertex_loop4_WB1          ; i == GR_DLIST_END, done with all four vertices
+    jz        .vertex_loop4_done_WB1     ; i == GR_DLIST_END, skip the loop and finish the last vertex
 
-$vertex_loop4_WB1:
+.vertex_loop4_WB1:
     movd      mm6, [va + eax]            ; FARRAY(a,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1853,15 +1844,15 @@ $vertex_loop4_WB1:
     test      eax, eax                   ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop4_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop4_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [va + eax]            ; FARRAY(a,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jnz       $vertex_loop4_WB1          ; nope, more components to handle
+    jnz       .vertex_loop4_WB1          ; nope, more components to handle
 
-$vertex_loop4_done_WB1:
+.vertex_loop4_done_WB1:
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
     punpckldq mm7, mm4                   ; previous component | TRI_SETF(256.f)
 
@@ -1869,9 +1860,9 @@ $vertex_loop4_done_WB1:
     movq      mm7, mm4                   ; TRI_SETF(0.f) | TRI_SETF(256.f)
 
     punpckhdq mm7, mm7                   ; write buffer = TRI_SETF(0.f)
-    jmp       $line_done_WB1             ; last vertex done
+    jmp       .line_done_WB1             ; last vertex done
 
-$j_lt_i2_WB1:
+.j_lt_i2_WB1:
 
 ; 317  :     } else { /* y major */
 ; 318  :       TRI_SETF(FARRAY(b, 0) - _GlideRoot.pool.fHalf);
@@ -1903,9 +1894,9 @@ $j_lt_i2_WB1:
     cmp       eax, 0                     ; i != GR_DLIST_END ?
 
     nop                                  ; filler
-    je        $vertex_loop5_done_WB1     ; i == GR_DLIST_END, no more components in vertex
+    je        .vertex_loop5_done_WB1     ; i == GR_DLIST_END, no more components in vertex
 
-$vertex_loop5_WB1:
+.vertex_loop5_WB1:
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1916,15 +1907,15 @@ $vertex_loop5_WB1:
     test      eax, eax                   ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop5_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop5_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jnz       $vertex_loop5_WB1          ; nope, more components to handle
+    jnz       .vertex_loop5_WB1          ; nope, more components to handle
 
-$vertex_loop5_done_WB1:
+.vertex_loop5_done_WB1:
 
 ; 327  :       TRI_SETF(0.f);
 ; 328  :       TRI_SETF(0.f);
@@ -1958,9 +1949,9 @@ $vertex_loop5_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jz        $vertex_loop6_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop6_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop6_WB1:
+.vertex_loop6_WB1:
     movd      mm6, [va + eax]            ; FARRAY(a,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -1971,15 +1962,15 @@ $vertex_loop6_WB1:
     movq      [fifo-8], mm7              ; PCI write current component | previous component
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jz        $vertex_loop6_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop6_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [va + eax]            ; FARRAY(a,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    jne       $vertex_loop6_WB1          ; nope, more components to handle
+    jne       .vertex_loop6_WB1          ; nope, more components to handle
 
-$vertex_loop6_done_WB1:
+.vertex_loop6_done_WB1:
 
 ; 339  :       TRI_SETF(0.f);
 ; 340  :       TRI_SETF(0.f);
@@ -2013,9 +2004,9 @@ $vertex_loop6_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    je        $vertex_loop7_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
+    je        .vertex_loop7_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop7_WB1:
+.vertex_loop7_WB1:
     movd      mm6, [vb + eax]            ; FARRAY(b,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -2026,15 +2017,15 @@ $vertex_loop7_WB1:
     cmp       eax, 0                     ; i == GR_DLIST_END ?
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop7_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop7_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [vb + eax]            ; FARRAY(b,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jnz       $vertex_loop7_WB1          ; nope, more components to handle
+    jnz       .vertex_loop7_WB1          ; nope, more components to handle
 
-$vertex_loop7_done_WB1:
+.vertex_loop7_done_WB1:
 
 ; 351  :       TRI_SETF(256.f);
 ; 352  :       TRI_SETF(0.f);
@@ -2070,9 +2061,9 @@ $vertex_loop7_done_WB1:
     lea       dlp, [dlpStart+4]          ; point to next entry in offset list
 
     test      eax, eax                   ; i == GR_DLIST_END ?
-    jz        $vertex_loop8_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
+    jz        .vertex_loop8_done_WB1     ; i == GR_DLIST_END, no further components need to be pushed out
 
-$vertex_loop8_WB1:
+.vertex_loop8_WB1:
     movd      mm6, [va + eax]            ; FARRAY(a,i)
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
@@ -2083,15 +2074,15 @@ $vertex_loop8_WB1:
     punpckldq mm7, mm6                   ; current component | previous component
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jz        $vertex_loop8_done_WB0     ; yup, i == GR_DLIST_END
+    jz        .vertex_loop8_done_WB0     ; yup, i == GR_DLIST_END
 
     movd      mm7, [va + eax]            ; FARRAY(a,i)
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     cmp       eax, 0                     ; i == GR_DLIST_END ?
-    jnz       $vertex_loop8_WB1          ; nope, more components to handle
+    jnz       .vertex_loop8_WB1          ; nope, more components to handle
 
-$vertex_loop8_done_WB1:
+.vertex_loop8_done_WB1:
 
     punpckldq mm7, mm4                   ; 256.0f | previous component
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
@@ -2102,12 +2093,12 @@ $vertex_loop8_done_WB1:
     punpckhdq mm7, mm7                   ; write buffer = 256.0f
     nop                                  ; filler
 
-$line_done_WB1:
+.line_done_WB1:
 
     movd      [fifo], mm7                ; flush "write buffer"
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
 
-$line_done_WB0:
+.line_done_WB0:
 
 ; 365  :     }
 ; 366  :     TRI_END;
@@ -2126,20 +2117,20 @@ $line_done_WB0:
     nop                                  ; filler
 
     sub       eax, fifo                  ; fifo space used up
-    mov       ebx, [__GlideRoot+stats_linesDrawn]     ; _GlideRoot.stats.linesDrawn
+    mov       ebx, [_GlideRoot+stats_linesDrawn]     ; _GlideRoot.stats.linesDrawn
 
     add       edx, eax                   ; fifo space available now
-    mov       ebp, [__GlideRoot+stats_otherTrisDrawn]     ; _GlideRoot.stats.othertrisDrawn
+    mov       ebp, [_GlideRoot+stats_otherTrisDrawn]     ; _GlideRoot.stats.othertrisDrawn
 
     mov       [gc + fifoRoom], edx       ; save available fifo space
     inc       ebx                        ; _GlideRoot.stats.linesDrawn++
 
-    mov       [__GlideRoot+stats_linesDrawn], ebx     ; save _GlideRoot.stats.linesDrawn
+    mov       [_GlideRoot+stats_linesDrawn], ebx     ; save _GlideRoot.stats.linesDrawn
     add       ebp, 2                     ; _GlideRoot.stats.othertrisDrawn+=2
     
-    mov       [__GlideRoot+stats_otherTrisDrawn], ebp     ; _GlideRoot.stats.othertrisDrawn+=2
+    mov       [_GlideRoot+stats_otherTrisDrawn], ebp     ; _GlideRoot.stats.othertrisDrawn+=2
     nop                                  ; filler
-$line_all_done:
+.line_all_done:
 
     femms                                ; done with MMX; empty FPU/MMX state
 
@@ -2150,14 +2141,13 @@ $line_all_done:
     pop     esi                          ; restore caller's register variable
 
     ret     8                            ; return and pop 2 DWORD parameters
-__grDrawTextureLine_3DNow@8 ENDP
+endp
 
-        PUBLIC  __grDrawTriangles_3DNow@12
-__grDrawTriangles_3DNow@12 PROC NEAR
+proc _grDrawTriangles_3DNow, 12
 
-_mode     = 20
-_count    = 24
-_pointers = 28    
+%define _mode     20
+%define _count    24
+%define _pointers 28
 
 ; 930  : {
 ; 931  : #define FN_NAME "_grDrawTriangles_3DNow"
@@ -2177,16 +2167,16 @@ _pointers = 28
 ; 945  : 
 ; 946  :   GR_FLUSH_STATE();
 
-gc            TEXTEQU  <edi>             ; points to graphics context
-fifo          TEXTEQU  <ecx>             ; points to next entry in fifo
-dlp           TEXTEQU  <ebp>             ; points to dataList structure
-vertexCount   TEXTEQU  <esi>             ; Current vertex counter in the packet
-vertexPtr     TEXTEQU  <ebx>             ; Current vertex pointer (in deref mode)
-vertex        TEXTEQU  <ebx>             ; Current vertex (in non-deref mode)
-dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
+%define gc            edi             ; points to graphics context
+%define fifo          ecx             ; points to next entry in fifo
+%define dlp           ebp             ; points to dataList structure
+%define vertexCount   esi             ; Current vertex counter in the packet
+%define vertexPtr     ebx             ; Current vertex pointer (in deref mode)
+%define vertex        ebx             ; Current vertex (in non-deref mode)
+%define dlpStart      edx             ; Pointer to start of offset list
 
     push      edi                        ; save caller's register variable
-    mov       gc, [__GlideRoot + curGC]  ; get current graphics context
+    mov       gc, [_GlideRoot + curGC]  ; get current graphics context
 
     push      esi                        ; save caller's register variable
     mov       vertexCount, [esp+_count-8]; number of vertices in triangles
@@ -2197,7 +2187,7 @@ dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
     mov       vertexPtr, [esp+_pointers] ; get current vertex pointer (deref mode)
     test      vertexCount, vertexCount   ; number of vertices <= 0 ?
     
-    jle       $tris_done                 ; yup, triangles are done
+    jle       .tris_done                 ; yup, triangles are done
 
 ; 947  : 
 ; 948  : #ifdef GLIDE_DEBUG
@@ -2233,17 +2223,17 @@ dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
 
     mul       ebp                        ; edx:eax = 1/3*2*2^32*count; edx = 1/3*2*count
 
-    mov       eax, [__GlideRoot+trisProcessed] ; trisProcessed
+    mov       eax, [_GlideRoot+trisProcessed] ; trisProcessed
     shr       edx, 1                     ; count/3
 
     add       eax, edx                   ; trisProcessed += count/3
     mov       edx, [esp + _mode]         ; get mode (0 or 1)
 
     mov       ecx, [gc + CoordinateSpace]; coordinates space (window/clip)
-    mov       [__GlideRoot+trisProcessed], eax ; trisProcessed
+    mov       [_GlideRoot+trisProcessed], eax ; trisProcessed
 
     test      edx, edx                   ; mode 0 (array of vertices) ?
-    jnz       $deref_mode                ; nope, it's mode 1 (array of pointers to vertices)
+    jnz       .deref_mode                ; nope, it's mode 1 (array of pointers to vertices)
 
     mov       edx, [gc + vertexStride]   ; get stride in DWORDs
     nop                                  ; filler
@@ -2252,7 +2242,7 @@ dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
     cmp       ecx, 0                     ; coordinate space == 0 (window) ?
 
     mov       [strideinbytes], edx       ; save off stride (in bytes)
-    jnz       $clip_coordinates_ND       ; nope, coordinate space != window  
+    jnz       .clip_coordinates_ND       ; nope, coordinate space != window
 
 ; 961  :     while (count > 0) {
 ; 962  :       FxI32 vcount = count >=15 ? 15 : count;
@@ -2260,7 +2250,7 @@ dlpStart      TEXTEQU  <edx>             ; Pointer to start of offset list
 ; 964  :       TRI_STRIP_BEGIN(kSetupStrip, vcount, gc->state.vData.vSize, SSTCP_PKT3_BDDBDD);
 ; 965  :       
 
-$win_coords_loop_ND:
+.win_coords_loop_ND:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -2276,17 +2266,17 @@ $win_coords_loop_ND:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       $win_tri_begin_ND          ; yup, start writing triangle data
+    jge       .win_tri_begin_ND          ; yup, start writing triangle data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
 
-$win_tri_begin_ND:
+.win_tri_begin_ND:
 
     mov       eax, vertexCount           ; number of vertices in triangles
     mov       fifo, [gc + fifoPtr]       ; get fifoPtr
@@ -2300,7 +2290,7 @@ $win_tri_begin_ND:
     lea       dlpStart, [gc+tsuDataList] ; pointer to start of offset list
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
 
-    jz        $fifo_aligned_ND           ; yup
+    jz        .fifo_aligned_ND           ; yup
 
     mov       [fifo], eax                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
@@ -2329,7 +2319,7 @@ $win_tri_begin_ND:
 ; 987  :       count -= 15;
 ; 988  :     }
 
-$win_vertex_loop_ND_WB0:                 ; nothing in "write buffer"
+.win_vertex_loop_ND_WB0:                 ; nothing in "write buffer"
 
     mov       eax, [dlpStart]            ; get first offset from offset list
     mov       dlp, dlpStart              ; point to start of offset list
@@ -2341,15 +2331,15 @@ $win_vertex_loop_ND_WB0:                 ; nothing in "write buffer"
     test      eax, eax                   ; if offset == 0, end of list
 
     movq      [fifo-8], mm1              ; PCI write x, y
-    jz        $win_datalist_end_ND_WB0   ; no more vertex data, nothing in "write buffer" 
+    jz        .win_datalist_end_ND_WB0   ; no more vertex data, nothing in "write buffer"
 
-$win_datalist_loop_ND_WB0:               ; nothing in "write buffer"
+.win_datalist_loop_ND_WB0:               ; nothing in "write buffer"
 
     movd      mm1, [vertex + eax]        ; get next parameter
     mov       eax, [dlp]                 ; get next offset from offset list
 
     cmp       eax, 0                     ; at end of offset list (offset == 0) ?
-    jz        $win_datalist_end_ND_WB1   ; exit, write buffer contains one DWORD
+    jz        .win_datalist_end_ND_WB1   ; exit, write buffer contains one DWORD
 
     movd      mm2, [vertex + eax]        ; get next parameter
     add       dlp, 8                     ; dlp++
@@ -2361,17 +2351,17 @@ $win_datalist_loop_ND_WB0:               ; nothing in "write buffer"
     punpckldq mm1, mm2                   ; current param | previous param
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jnz       $win_datalist_loop_ND_WB0  ; nope, copy next parameter
+    jnz       .win_datalist_loop_ND_WB0  ; nope, copy next parameter
 
-$win_datalist_end_ND_WB0:
+.win_datalist_end_ND_WB0:
 
     mov       eax, [strideinbytes]       ; get offset to next vertex
     dec       vertexCount                ; another vertex done. Any left?
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       $win_vertex_loop_ND_WB0    ; yup, output next vertex
+    jnz       .win_vertex_loop_ND_WB0    ; yup, output next vertex
 
-$win_vertex_end_ND_WB0:
+.win_vertex_end_ND_WB0:
 
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
     mov       ebp, [gc + fifoRoom]       ; old number of bytes available in fifo
@@ -2389,7 +2379,7 @@ $win_vertex_end_ND_WB0:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        $win_coords_loop_ND        ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_ND        ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -2401,11 +2391,11 @@ $win_vertex_end_ND_WB0:
 
     ret       12                         ; return, pop 3 DWORD parameters off stack
 
-$fifo_aligned_ND:
+.fifo_aligned_ND:
 
     movd      mm1, eax                   ; move header into "write buffer"
 
-$win_vertex_loop_ND_WB1:                 ; one DWORD in "write buffer"
+.win_vertex_loop_ND_WB1:                 ; one DWORD in "write buffer"
 
     movd      mm2, [vertex]              ; 0 | x of vertex
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxU32)
@@ -2420,9 +2410,9 @@ $win_vertex_loop_ND_WB1:                 ; one DWORD in "write buffer"
     movd      mm1, [vertex+4]            ; 0 | y of vertex
 
     test      eax, eax                   ; offset == 0 (list empty) ?
-    jz        $win_datalist_end_ND_WB1   ; yup, no more vertex data, one DWORD in "write buffer"
+    jz        .win_datalist_end_ND_WB1   ; yup, no more vertex data, one DWORD in "write buffer"
 
-$win_datalist_loop_ND_WB1:               ; one DWORD in "write buffer" 
+.win_datalist_loop_ND_WB1:               ; one DWORD in "write buffer" 
 
     movd      mm2, [vertex + eax]        ; get next parameter
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxU32)
@@ -2434,23 +2424,23 @@ $win_datalist_loop_ND_WB1:               ; one DWORD in "write buffer"
     cmp       eax, 0                     ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jz        $win_datalist_end_ND_WB0   ; yes, exit, "write buffer" empty
+    jz        .win_datalist_end_ND_WB0   ; yes, exit, "write buffer" empty
 
     movd      mm1, [vertex + eax]        ; get next parameter
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax, eax                   ; at end of offset list (offset == 0) ?
-    jnz       $win_datalist_loop_ND_WB1  ; nope, copy next parameter
+    jnz       .win_datalist_loop_ND_WB1  ; nope, copy next parameter
 
-$win_datalist_end_ND_WB1:
+.win_datalist_end_ND_WB1:
 
     mov       eax, [strideinbytes]       ; get offset to next vertex
     dec       vertexCount                ; another vertex done. Any left?
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       $win_vertex_loop_ND_WB1    ; yup, output next vertex
+    jnz       .win_vertex_loop_ND_WB1    ; yup, output next vertex
 
-$win_vertex_end_ND_WB1:
+.win_vertex_end_ND_WB1:
 
     movd      [fifo], mm1                ; flush "write buffer"
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
@@ -2471,7 +2461,7 @@ $win_vertex_end_ND_WB1:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        $win_coords_loop_ND        ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_ND        ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -2483,14 +2473,14 @@ $win_vertex_end_ND_WB1:
 
     ret       12                         ; return, pop 3 DWORD parameters off stack
 
-$deref_mode:
+.deref_mode:
 
     prefetch  [vertexPtr]                ; pre-load first group of pointers
 
     test      ecx, ecx                   ; coordinate space == 0 (window) ?
-    jnz       $clip_coordinates_D        ; nope, coordinate space != window
+    jnz       .clip_coordinates_D        ; nope, coordinate space != window
 
-$win_coords_loop_D:
+.win_coords_loop_D:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -2506,18 +2496,18 @@ $win_coords_loop_D:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       $win_tri_begin_D           ; yup, start writing triangle data
+    jge       .win_tri_begin_D           ; yup, start writing triangle data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
     nop                                  ; filler
 
-$win_tri_begin_D:
+.win_tri_begin_D:
 
     mov       eax, vertexCount           ; number of vertices in triangles
     mov       fifo, [gc + fifoPtr]       ; get fifoPtr
@@ -2532,12 +2522,12 @@ $win_tri_begin_D:
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
 
     nop                                  ; filler
-    jz        $fifo_aligned_D            ; yup
+    jz        .fifo_aligned_D            ; yup
 
     mov       [fifo], eax                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
 
-$win_vertex_loop_D_WB0:                  ; nothing in "write buffer"
+.win_vertex_loop_D_WB0:                  ; nothing in "write buffer"
 
     mov       edx, [vertexPtr]           ; dereference pointer, edx points to vertex
     add       vertexPtr, 4               ; next pointer
@@ -2552,15 +2542,15 @@ $win_vertex_loop_D_WB0:                  ; nothing in "write buffer"
     add       fifo, 8                    ; fifo += 2
 
     test      eax, eax                   ; if offset == 0, end of offset list
-    je        $win_datalist_end_D_WB0    ; no more vertex data, nothing in "write buffer" 
+    je        .win_datalist_end_D_WB0    ; no more vertex data, nothing in "write buffer"
 
-$win_datalist_loop_D_WB0:                ; nothing in "write buffer"
+.win_datalist_loop_D_WB0:                ; nothing in "write buffer"
 
     movd      mm1, [edx + eax]           ; get next parameter
     mov       eax, [dlp]                 ; get next offset from offset list
 
     test      eax, eax                   ; at end of offset list (offset == 0) ?
-    jz        $win_datalist_end_D_WB1    ; exit, write buffer contains one DWORD
+    jz        .win_datalist_end_D_WB1    ; exit, write buffer contains one DWORD
 
     movd      mm2, [edx + eax]           ; get next parameter
     add       dlp, 8                     ; dlp++
@@ -2572,14 +2562,14 @@ $win_datalist_loop_D_WB0:                ; nothing in "write buffer"
     cmp       eax, 0                     ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jnz       $win_datalist_loop_D_WB0   ; nope, copy next parameter
+    jnz       .win_datalist_loop_D_WB0   ; nope, copy next parameter
 
-$win_datalist_end_D_WB0:
+.win_datalist_end_D_WB0:
 
     dec       vertexCount                ; another vertex done. Any left?
-    jnz       $win_vertex_loop_D_WB0     ; yup, output next vertex
+    jnz       .win_vertex_loop_D_WB0     ; yup, output next vertex
 
-$win_vertex_end_D_WB0:
+.win_vertex_end_D_WB0:
 
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
     mov       ebp, [gc + fifoRoom]       ; old number of bytes available in fifo
@@ -2597,7 +2587,7 @@ $win_vertex_end_D_WB0:
     test      vertexCount, vertexCount   ; any vertices left to process ?
 
     mov       [esp + _count], vertexCount; remaining number of vertices to process
-    jg        $win_coords_loop_D         ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_D         ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -2609,11 +2599,11 @@ $win_vertex_end_D_WB0:
 
     ret       12                         ; return, pop 3 DWORD parameters off stack
 
-$fifo_aligned_D:
+.fifo_aligned_D:
 
     movd      mm1, eax                   ; move header into "write buffer"
 
-$win_vertex_loop_D_WB1:                  ; one DWORD in "write buffer"
+.win_vertex_loop_D_WB1:                  ; one DWORD in "write buffer"
 
     mov       edx, [vertexPtr]           ; dereference pointer, edx points to vertex
     add       vertexPtr, 4               ; next pointer
@@ -2631,9 +2621,9 @@ $win_vertex_loop_D_WB1:                  ; one DWORD in "write buffer"
     movd      mm1, [edx + 4]             ; 0 | y of vertex
 
     cmp       eax, 0                     ; offset == 0 (list empty) ?
-    je        $win_datalist_end_D_WB1    ; yup, no more vertex data, one DWORD in "write buffer"
+    je        .win_datalist_end_D_WB1    ; yup, no more vertex data, one DWORD in "write buffer"
 
-$win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
+.win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
 
     movd      mm2, [edx + eax]           ; get next parameter
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxU32)
@@ -2645,20 +2635,20 @@ $win_datalist_loop_D_WB1:                ; one DWORD in "write buffer" = MM1
     test      eax, eax                   ; at end of offset list (offset == 0) ?
 
     movq      [fifo-8], mm1              ; PCI write current param | previous param
-    jz        $win_datalist_end_D_WB0    ; yes, exit, "write buffer" empty
+    jz        .win_datalist_end_D_WB0    ; yes, exit, "write buffer" empty
 
     movd      mm1, [edx + eax]           ; get next parameter
     mov       eax, [dlp-4]               ; get next offset from offset list
 
     test      eax,  eax                  ; at end of offset list (offset == 0) ?
-    jnz       $win_datalist_loop_D_WB1   ; nope, copy next parameter
+    jnz       .win_datalist_loop_D_WB1   ; nope, copy next parameter
 
-$win_datalist_end_D_WB1:
+.win_datalist_end_D_WB1:
 
     dec       vertexCount                ; another vertex done. Any left?
-    jnz       $win_vertex_loop_D_WB1     ; yup, output next vertex
+    jnz       .win_vertex_loop_D_WB1     ; yup, output next vertex
 
-$win_vertex_end_D_WB1:
+.win_vertex_end_D_WB1:
 
     movd      [fifo], mm1                ; flush "write buffer"
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
@@ -2679,7 +2669,7 @@ $win_vertex_end_D_WB1:
     cmp       vertexCount, 0             ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        $win_coords_loop_D         ; loop if number of vertices to process >= 0
+    jg        .win_coords_loop_D         ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -2723,19 +2713,19 @@ $win_vertex_end_D_WB1:
 ; 1016 :       }
 ; 1017 :       TRI_END;
 
-ifndef GLIDE3_SCALER
+%ifndef GLIDE3_SCALER
 
-$clip_coordinates_D:
+.clip_coordinates_D:
 
-    mov       [strideinbytes], 4         ; unit stride for array of pointers to vertices
+    mov       dword [strideinbytes], 4         ; unit stride for array of pointers to vertices
 
-$clip_coordinates_ND:
+.clip_coordinates_ND:
 
-dataElem      textequ <ebp>              ; number of vertex components processed
+%define dataElem      ebp              ; number of vertex components processed
 
-    movd      mm6, [__GlideRoot+pool_f255]      ; GlideRoot.pool.f255 
+    movd      mm6, [_GlideRoot+pool_f255]      ; GlideRoot.pool.f255
 
-$clip_coords_begin:
+.clip_coords_begin:
 
 ;;;     }
 ;;;     else {
@@ -2760,17 +2750,17 @@ $clip_coords_begin:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       $clip_tri_begin            ; yup, start writing triangle data
+    jge       .clip_tri_begin            ; yup, start writing triangle data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
 
-$clip_tri_begin:
+.clip_tri_begin:
 
     mov       edx, vertexCount           ; number of vertices in triangles
     mov       fifo, [gc + fifoPtr]       ; get fifoPtr
@@ -2783,7 +2773,7 @@ $clip_tri_begin:
     mov       [fifo], edx                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
 
-$clip_for_begin:
+.clip_for_begin:
 
     mov       edx, vertexPtr             ; vertex = vertexPtr (assume no-deref mode)
     mov       eax, [esp+_mode]           ; mode 0 = no deref, mode 1 = deref
@@ -2792,11 +2782,11 @@ $clip_for_begin:
     test      eax, eax                   ; deref mode ?
 
     mov       eax, [gc + wInfo_offset]   ; get offset of W into vertex struct
-    jz        $clip_noderef              ; yup, no-deref mode
+    jz        .clip_noderef              ; yup, no-deref mode
 
     mov       edx, [vertexPtr]           ; vertex = *vertexPtr
 
-$clip_noderef:
+.clip_noderef:
 
     movd      mm0, [edx + eax]           ; 0 | W of current vertex
     pfrcp     mm1, mm0                   ; 0 | 1/W approx
@@ -2826,13 +2816,13 @@ $clip_noderef:
     mov       eax, [gc + tsuDataList]    ; first entry from offset list
 
     movq      [fifo-8], mm2              ; PCI write transformed x, y
-    jz        $clip_setup_ooz            ; nope, no color at all needed
+    jz        .clip_setup_ooz            ; nope, no color at all needed
   
-    cmp       DWORD PTR [gc+colorType], 0; gc->state.vData.colorType == GR_FLOAT ?
-    jne       $clip_setup_pargb          ; nope, packed ARGB format
+    cmp       dword [gc+colorType], 0; gc->state.vData.colorType == GR_FLOAT ?
+    jne       .clip_setup_pargb          ; nope, packed ARGB format
   
     test      esi, 1                     ; STATE_REQUIRES_IT_DRGB ?
-    jz        $clip_setup_a              ; no, but definitely A
+    jz        .clip_setup_a              ; no, but definitely A
 
     movd      mm2, [edx + eax]           ; 0 | r
     mov       eax, [gc + tsuDataList+4]  ; offset of g part of vertex data
@@ -2856,9 +2846,9 @@ $clip_noderef:
     lea       fifo, [fifo+12]            ; fifoPtr += 3*sizeof(FxFloat)
 
     movd      [fifo-4], mm2              ; PCI write b*255
-    jz        $clip_setup_ooz            ; nope, no alpha, proceeed with ooz
+    jz        .clip_setup_ooz            ; nope, no alpha, proceed with ooz
 
-$clip_setup_a:
+.clip_setup_a:
     movd      mm2, [eax+edx]             ; 0 | a
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
 
@@ -2869,9 +2859,9 @@ $clip_setup_a:
     mov       eax, [gc+dataElem+tsuDataList]; offset of next part of vertex data
 
     movd      [fifo-4], mm2              ; PCI write a*255
-    jmp       $clip_setup_ooz            ; check whether we need to push out z
+    jmp       .clip_setup_ooz            ; check whether we need to push out z
 
-$clip_setup_pargb:
+.clip_setup_pargb:
     movd      mm2, [eax+edx]             ; get packed ARGB data
     add       fifo, 4                    ; fifoPtr += sizeof(FxU32)
 
@@ -2881,10 +2871,10 @@ $clip_setup_pargb:
     movd      [fifo-4], mm2              ; PCI write packed ARGB
     nop                                  ; filler
 
-$clip_setup_ooz:
+.clip_setup_ooz:
   
     test      esi, 4                     ; STATE_REQUIRES_OOZ ?
-    jz        $clip_setup_qow            ; nope
+    jz        .clip_setup_qow            ; nope
 
     movd      mm2, [eax + edx]           ; 0 | z component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -2901,12 +2891,12 @@ $clip_setup_ooz:
     pfadd     mm2, mm4                   ; 0 | TRI_SETF(FARRAY(_s, i)*_oow*gc->state.Viewport.hdepth+gc->state.Viewport.oz
     movd      [fifo-4], mm2              ; PCI write transformed Z
 
-$clip_setup_qow:
+.clip_setup_qow:
     test      esi, 8                     ; STATE_REQUIRES_OOW_FBI ?
-    jz        $clip_setup_qow0           ; nope
+    jz        .clip_setup_qow0           ; nope
 
-    cmp       DWORD PTR [gc+qInfo_mode],0; does vertex have Q component ?
-    je        $clip_setup_oow            ; nope, not Q but W
+    cmp       dword [gc+qInfo_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow            ; nope, not Q but W
 
     mov       eax, [gc + qInfo_offset]   ; offset of Q component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -2918,21 +2908,21 @@ $clip_setup_qow:
     pfmul     mm2, mm0                   ; q*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed Q
-    jmp       $clip_setup_qow0           ; continue with q0
+    jmp       .clip_setup_qow0           ; continue with q0
 
-$clip_setup_oow:
+.clip_setup_oow:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-$clip_setup_qow0:
+.clip_setup_qow0:
     test      esi, 16                    ; STATE_REQUIRES_W_TMU0 ?
-    jz        $clip_setup_stow0          ; nope 
+    jz        .clip_setup_stow0          ; nope
 
-    cmp       DWORD PTR [gc+q0Info_mode],0; does vertex have Q component ?
-    je        $clip_setup_oow0           ; nope, not Q but W
+    cmp       dword [gc+q0Info_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow0           ; nope, not Q but W
 
     mov       eax, [gc+q0Info_offset]    ; offset of Q component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -2944,21 +2934,21 @@ $clip_setup_qow0:
     pfmul     mm2, mm0                   ; q0*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed q0
-    jmp       $clip_setup_stow0          ; continue with stow0
+    jmp       .clip_setup_stow0          ; continue with stow0
 
     nop                                  ; filler
 
-$clip_setup_oow0:
+.clip_setup_oow0:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-$clip_setup_stow0:
+.clip_setup_stow0:
 
     test      esi, 32                    ; STATE_REQUIRES_ST_TMU0 ?
-    jz        $clip_setup_qow1           ; nope
+    jz        .clip_setup_qow1           ; nope
 
     movq      mm7, [gc + tmu0_s_scale]   ; state.tmu_config[0].t_scale | state.tmu_config[0].s_scale
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
@@ -2978,12 +2968,12 @@ $clip_setup_stow0:
     movq      [fifo-8], mm2              ; PCI write param2*oow*tmu0_t_scale | param1*oow*tmu0_s_scale
     mov       eax, [gc+dataElem+tsuDataList]; pointer to next vertex component
 
-$clip_setup_qow1:
+.clip_setup_qow1:
     test      esi, 64                    ; STATE_REQUIRES_W_TMU1 ?
-    jz        $clip_setup_stow1          ; nope
+    jz        .clip_setup_stow1          ; nope
 
-    cmp       DWORD PTR [gc+q1Info_mode],0; does vertex have Q component ?
-    je        $clip_setup_oow1           ; nope, not Q but W
+    cmp       dword [gc+q1Info_mode],0; does vertex have Q component ?
+    je        .clip_setup_oow1           ; nope, not Q but W
 
     mov       eax, [gc+q1Info_offset]    ; offset of Q component of vertex
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
@@ -2995,22 +2985,22 @@ $clip_setup_qow1:
     pfmul     mm2, mm0                  ; q1*oow
 
     movd      [fifo-4], mm2              ; PCI write transformed q1
-    jmp       $clip_setup_stow1          ; continue with stow1
+    jmp       .clip_setup_stow1          ; continue with stow1
 
-$clip_setup_oow1:
+.clip_setup_oow1:
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat) 
     add       dataElem, 4                ; dataElem++
 
     movd      [fifo-4], mm0              ; PCI write oow
     mov       eax,[gc+dataElem+tsuDataList]; pointer to next vertex component
 
-$clip_setup_stow1:
+.clip_setup_stow1:
 
     test      esi, 128                   ; STATE_REQUIRES_ST_TMU1 ?
     mov       vertexCount, [vertices]    ; get number of vertices
 
     movq      mm7, [gc + tmu1_s_scale]   ; state.tmu_config[1].t_scale | state.tmu_config[1].s_scale
-    jz        $clip_setup_end            ; nope
+    jz        .clip_setup_end            ; nope
 
     movd      mm2, [edx + eax]           ; param1
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
@@ -3024,12 +3014,12 @@ $clip_setup_stow1:
     pfmul     mm2, mm7                   ; param2*oow*state.tmu_config[1].t_scale | param1*oow*state.tmu_config[1].s_scale
     movq      [fifo-8], mm2              ; PCI write param2*oow*state.tmu_config[1].t_scale | param1*oow*state.tmu_config[1].s_scale
 
-$clip_setup_end:
+.clip_setup_end:
 
     dec       vertexCount                ; vcount--
-    jnz       $clip_for_begin            ; until 
+    jnz       .clip_for_begin            ; until
 
-$clip_for_end:
+.clip_for_end:
 
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
     mov       ebp, [gc + fifoRoom]       ; old number of bytes available in fifo
@@ -3046,11 +3036,11 @@ $clip_for_end:
     mov       [esp + _count], vertexCount; remaining number of vertices to process 
     cmp       vertexCount, 0             ; any vertices left to process ?
 
-    jg        $clip_coords_begin         ; loop if number of vertices to process >= 0
+    jg        .clip_coords_begin         ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
-else ; GLIDE3_SCALER
+%else ; GLIDE3_SCALER
 
 ;----------------------------------------------------------------------------
 
@@ -3065,14 +3055,14 @@ else ; GLIDE3_SCALER
 
 ;----------------------------------------------------------------------------
 
-dataElem      textequ <ebp>              ; number of vertex components processed
+%define dataElem      ebp              ; number of vertex components processed
 
     nop                                  ; filler for code alignment
 
-$clip_coordinates_ND:
+.clip_coordinates_ND:
 
     pxor      mm0, mm0                   ; load 0
-    movd      mm1, [__GlideRoot+pool_f255]; GlideRoot.pool.f255 
+    movd      mm1, [_GlideRoot+pool_f255]; GlideRoot.pool.f255
 
     movd      [atab], mm0                ; atable[0] = 0.0f
     movd      mm5, [gc + vp_oz]          ; gc->state.Viewport.oz
@@ -3083,13 +3073,13 @@ $clip_coordinates_ND:
     movq      mm1, [gc + vp_ox]          ; gc->state.Viewport.oy | gc->state.Viewport.ox
     movd      [btab+4], mm5              ; btable[1] = gc->state.Viewport.oz
 
-    movq      QWORD PTR [btab+8], mm0    ; btable[3] = 0.0f | btable[2] = 0.0f
+    movq      QWORD [btab+8], mm0    ; btable[3] = 0.0f | btable[2] = 0.0f
     nop                                  ; filler
 
-    movq      QWORD PTR [btab+16], mm0   ; btable[5] = 0.0f | btable[4] = 0.0f
-    movq      QWORD PTR [btab+24], mm0   ; btable[7] = 0.0f | btable[6] = 0.0f
+    movq      QWORD [btab+16], mm0   ; btable[5] = 0.0f | btable[4] = 0.0f
+    movq      QWORD [btab+24], mm0   ; btable[7] = 0.0f | btable[6] = 0.0f
 
-$clip_coords_begin_ND:
+.clip_coords_begin_ND:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -3105,18 +3095,18 @@ $clip_coords_begin_ND:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       $clip_tri_begin_ND         ; yup, start writing triangle data
+    jge       .clip_tri_begin_ND         ; yup, start writing triangle data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
     nop                                  ; filler for code alignment
 
-$clip_tri_begin_ND:
+.clip_tri_begin_ND:
 
     mov       eax, vertexCount           ; number of vertices in triangles
     mov       fifo, [gc + fifoPtr]       ; get fifoPtr
@@ -3130,12 +3120,12 @@ $clip_tri_begin_ND:
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
 
     mov       edx, [gc + wInfo_offset]   ; gc->state.vData.wInfo.offset
-    jz        $clip_fifo_aligned_ND      ; yup
+    jz        .clip_fifo_aligned_ND      ; yup
 
     mov       [fifo], eax                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
 
-$clip_for_begin_WB0_ND:                  ; "write buffer" = MM7 is empty
+.clip_for_begin_WB0_ND:                  ; "write buffer" = MM7 is empty
 
     ;; here:  ebx = vertex
     ;;        ecx = fifo
@@ -3182,11 +3172,11 @@ $clip_for_begin_WB0_ND:                  ; "write buffer" = MM7 is empty
     movq      [fifo-8], mm7              ; PCI write transformed x, y; write buffer=mm7 empty
     nop                                  ; filler
 
-    movq      QWORD PTR [atab+12], mm3   ; atable[4] = oow*gc->state.tmu_config[0].t_scale | atable[3] = oow*gc->state.tmu_config[0].s_scale
+    movq      QWORD [atab+12], mm3   ; atable[4] = oow*gc->state.tmu_config[0].t_scale | atable[3] = oow*gc->state.tmu_config[0].s_scale
     pfmul     mm4, mm0                   ; oow*gc->state.tmu_config[1].t_scale | oow*gc->state.tmu_config[1].s_scale
 
-    movq      QWORD PTR [atab+20], mm4   ; atable[6] = oow*gc->state.tmu_config[1].t_scale | atable[5] = oow*gc->state.tmu_config[1].s_scale
-    jne       $not_pargb_WB0_ND          ; nope, gc->state.vData.colorType != GR_U8
+    movq      QWORD [atab+20], mm4   ; atable[6] = oow*gc->state.tmu_config[1].t_scale | atable[5] = oow*gc->state.tmu_config[1].s_scale
+    jne       .not_pargb_WB0_ND          ; nope, gc->state.vData.colorType != GR_U8
 
     movd      mm7, [vertex + eax]        ; get packed ARGB data; 1 DWORD in "write buffer"=mm7
     nop                                  ; filler
@@ -3194,10 +3184,10 @@ $clip_for_begin_WB0_ND:                  ; "write buffer" = MM7 is empty
     mov       eax, [gc + tsuDataList + 4]; get offset of next vertex component (after pargb)
     add       dataElem, 4                ; dataElem = 1 (namely pargb)
 
-$not_pargb_WB1_ND:
+.not_pargb_WB1_ND:
 
     test      eax, eax                   ; end of offset list ?
-    jz        $clip_setup_end_WB1_ND     ; yup
+    jz        .clip_setup_end_WB1_ND     ; yup
 
     ;; here:  eax = offset into vertex
     ;;        ebx = vertex
@@ -3210,14 +3200,14 @@ $not_pargb_WB1_ND:
     ;;        mm7 = write buffer
     ;; avail: edx, mm0, mm3, mm4, mm5, mm6
 
-$inner_loop_WB1_ND:
+.inner_loop_WB1_ND:
     mov       edx, [gc+dataElem+tsuDatLstScal]; iscaler
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
     movd      mm4, [vertex + eax]        ; TRI_SETF(FARRAY(vPtr, i)
-    movd      mm6, atab[edx*4]           ; atable [iscaler]
+    movd      mm6, [atab + edx*4]           ; atable [iscaler]
 
-    movd      mm5, btab[edx*4]           ; btable [iscaler]
+    movd      mm5, [btab + edx*4]           ; btable [iscaler]
     pfmul     mm4, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
 
     mov       eax, [gc+dataElem+tsuDataList+4]; next offset from offset list
@@ -3229,22 +3219,22 @@ $inner_loop_WB1_ND:
     mov       edx, [gc+dataElem+tsuDatLstScal+4]; next iscaler
     movq      [fifo-8], mm7              ; PCI write  new param | previous param
 
-    jz        $clip_setup_end_WB0_ND     ; yup, end of offset list, this vertex done
+    jz        .clip_setup_end_WB0_ND     ; yup, end of offset list, this vertex done
     add       dataElem, 8                ; dataElem++
 
     movd      mm7, [vertex+eax]          ; TRI_SETF(FARRAY(vPtr, i)
-    movd      mm6, atab[edx*4]           ; atable [iscaler]
+    movd      mm6, [atab + edx*4]           ; atable [iscaler]
 
-    movd      mm5, btab[edx*4]           ; btable [iscaler]
+    movd      mm5, [btab + edx*4]           ; btable [iscaler]
     pfmul     mm7, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
 
     mov       eax, [gc+dataElem+tsuDataList]; next offset from offset list
     pfadd     mm7, mm5                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]+btable [iscaler]
 
     cmp       eax, 0                     ; offset == 0 (end of offset list) ?
-    jnz       $inner_loop_WB1_ND         ; nope, get next component (1 DWORD in "write buffer")
+    jnz       .inner_loop_WB1_ND         ; nope, get next component (1 DWORD in "write buffer")
 
-$clip_setup_end_WB1_ND:
+.clip_setup_end_WB1_ND:
 
     mov       eax, [strideinbytes]       ; offset to next vertex
     nop                                  ; filler
@@ -3253,15 +3243,15 @@ $clip_setup_end_WB1_ND:
     dec       vertexCount                ; one less vertex to handle
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       $clip_for_begin_WB1_ND     ; until all vertices done; 1 DWORD in "write buffer" = MM7
+    jnz       .clip_for_begin_WB1_ND     ; until all vertices done; 1 DWORD in "write buffer" = MM7
 
     movd      [fifo], mm7                ; flush "write buffer"
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
 
     mov       esp, esp                   ; filler
-    jmp       $clip_setup_done_ND        ; all vertices handled, tri strip done
+    jmp       .clip_setup_done_ND        ; all vertices handled, tri strip done
 
-$clip_fifo_aligned_ND:                       
+.clip_fifo_aligned_ND:                       
 
     movd      mm7, eax                   ; write buffer has 1 DWORD now
 
@@ -3275,7 +3265,7 @@ $clip_fifo_aligned_ND:
 ;;;       TRI_SETF(FARRAY(vPtr, 4)
 ;;;         *oow*gc->state.Viewport.hheight + gc->state.Viewport.oy)
   
-$clip_for_begin_WB1_ND:
+.clip_for_begin_WB1_ND:
 
     ;; here:  ebx = vertex
     ;;        ecx = fifo
@@ -3318,7 +3308,7 @@ $clip_for_begin_WB1_ND:
     movd      [atab+4], mm6              ; atable[1] = oow*gc->state.Viewport.hdepth
 
     pfadd     mm5, mm1                   ; TRI_SETF(FARRAY(vPtr, 4)*oow*gc->state.Viewport.hheight + gc->state.Viewport.oy) |
-    movq      QWORD PTR [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale
+    movq      QWORD [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale
  
     movq      mm3, [gc + tmu1_s_scale]   ; gc->state.tmu_config[1].t_scale | gc->state.tmu_config[1].s_scale
     punpckldq mm7, mm5                   ; header | transformed x
@@ -3329,8 +3319,8 @@ $clip_for_begin_WB1_ND:
     pfmul     mm3, mm0                   ; oow*gc->state.tmu_config[1].t_scale | oow*gc->state.tmu_config[1].s_scale
     movq      mm7, mm5                   ; 1 DWORD in "write buffer" (transformed y)
 
-    movq      QWORD PTR [atab+20], mm3   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | oow * gc->state.tmu_config[1].s_scale
-    jne       $not_pargb_WB1_ND          ; nope, gc->state.vData.colorType != GR_U8
+    movq      qword [atab+20], mm3   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | atable[5] = oow * gc->state.tmu_config[1].s_scale
+    jne       .not_pargb_WB1_ND          ; nope, gc->state.vData.colorType != GR_U8
 
     movd      mm6, [vertex + eax]        ; get packed ARGB data;
     mov       dataElem, 4                ; dataElem = 1 (namely pargb)
@@ -3341,9 +3331,9 @@ $clip_for_begin_WB1_ND:
     movq      [fifo], mm7                ; PCI write pargb | transformed y
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
-$not_pargb_WB0_ND:
+.not_pargb_WB0_ND:
     cmp       eax, 0                     ; end of data offset list ?
-    jz        $clip_setup_end_WB0_ND     ; yup, this vertex done
+    jz        .clip_setup_end_WB0_ND     ; yup, this vertex done
 
     ;; here:  eax = offset into vertex
     ;;        ebx = vertex
@@ -3356,13 +3346,13 @@ $not_pargb_WB0_ND:
     ;;        mm3 = F256_F256
     ;; avail: edx, mm0, mm4, mm5, mm6, mm7
 
-$inner_loop_WB0_ND:
+.inner_loop_WB0_ND:
 
     mov       edx, [gc+dataElem+tsuDatLstScal]; iscaler
     movd      mm7, [vertex + eax]        ; TRI_SETF(FARRAY(vPtr, i)
 
-    movd      mm6, atab[edx*4]           ; atable[iscaler]
-    movd      mm5, btab[edx*4]           ; btable[iscaler]
+    movd      mm6, [atab + edx*4]           ; atable[iscaler]
+    movd      mm5, [btab + edx*4]           ; btable[iscaler]
 
     pfmul     mm7, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
     mov       eax, [gc+dataElem+tsuDataList+4] ; next offset from offset list
@@ -3371,12 +3361,12 @@ $inner_loop_WB0_ND:
     mov       edx, [gc+dataElem+tsuDatLstScal+4]; next iscaler
 
     cmp       eax, 0                     ; offset == 0 (end of offset list) ?
-    jz        $clip_setup_end_WB1_ND     ; yup, vertex done, 1 DWORD in "write buffer"
+    jz        .clip_setup_end_WB1_ND     ; yup, vertex done, 1 DWORD in "write buffer"
 
     movd      mm4, [vertex + eax]        ; TRI_SETF(FARRAY(vPtr, i)
-    movd      mm6, atab[edx*4]           ; atable[iscaler]
+    movd      mm6, [atab + edx*4]           ; atable[iscaler]
 
-    movd      mm5, btab[edx*4]           ; btable[iscaler]
+    movd      mm5, [btab + edx*4]           ; btable[iscaler]
     add       dataElem, 8                ; dataElem += 2
 
     add       fifo, 8                    ; fifoPtr += 2*sizeof*FxFloat)
@@ -3392,9 +3382,9 @@ $inner_loop_WB0_ND:
     nop                                  ; filler
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $inner_loop_WB0_ND         ; nope, vertex not done yet; "write buffer" empty
+    jnz       .inner_loop_WB0_ND         ; nope, vertex not done yet; "write buffer" empty
 
-$clip_setup_end_WB0_ND:
+.clip_setup_end_WB0_ND:
 
     mov       eax, [strideinbytes]       ; offset to next vertex
     mov       esp, esp                   ; filler
@@ -3403,9 +3393,9 @@ $clip_setup_end_WB0_ND:
     sub       vertexCount, 1             ; one less vertex to process, any left?
 
     lea       vertex, [vertex + eax]     ; points to next vertex
-    jnz       $clip_for_begin_WB0_ND     ; yup, vertices left, push out next vertex
+    jnz       .clip_for_begin_WB0_ND     ; yup, vertices left, push out next vertex
 
-$clip_setup_done_ND:
+.clip_setup_done_ND:
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
     mov       ebp, [gc + fifoRoom]       ; old number of bytes available in fifo
 
@@ -3422,7 +3412,7 @@ $clip_setup_done_ND:
     cmp       vertexCount, 0             ; any vertices left to process ?
 
     nop                                  ; filler
-    jg        $clip_coords_begin_ND      ; loop if number of vertices to process >= 0
+    jg        .clip_coords_begin_ND      ; loop if number of vertices to process > 0
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
@@ -3439,15 +3429,15 @@ $clip_setup_done_ND:
 
     nop                                  ; filler for code alignment
 
-$clip_coordinates_D:
+.clip_coordinates_D:
 
     pxor      mm0, mm0                   ; load 0
-    movd      mm1, [__GlideRoot+pool_f255]; GlideRoot.pool.f255 
+    movd      mm1, [_GlideRoot+pool_f255]; GlideRoot.pool.f255
 
     movd      [atab], mm0                ; atable[0] = 0.0f
     movd      mm5, [gc + vp_oz]          ; gc->state.Viewport.oz
 
-    movq      QWORD PTR [btab+8], mm0    ; btable[3] = 0.0f | btable[2] = 0.0f
+    movq      QWORD [btab+8], mm0    ; btable[3] = 0.0f | btable[2] = 0.0f
     movq      mm2, [gc + vp_hwidth]      ; gc->state.Viewport.hheight | gc->state.Viewport.hwidth
 
     movd      [atab+8], mm1              ; atable[2] = GlideRoot.pool.f255
@@ -3456,10 +3446,10 @@ $clip_coordinates_D:
     movd      [btab+4], mm5              ; btable[1] = gc->state.Viewport.oz
     nop                                  ; filler
 
-    movq      QWORD PTR [btab+16], mm0   ; btable[5] = 0.0f | btable[4] = 0.0f
-    movq      QWORD PTR [btab+24], mm0   ; btable[7] = 0.0f | btable[6] = 0.0f
+    movq      QWORD [btab+16], mm0   ; btable[5] = 0.0f | btable[4] = 0.0f
+    movq      QWORD [btab+24], mm0   ; btable[7] = 0.0f | btable[6] = 0.0f
 
-$clip_coords_begin_D:
+.clip_coords_begin_D:
 
     sub       vertexCount, 15            ; vertexCount >= 15 ? CF=0 : CF=1
     mov       ecx, [gc + vertexSize]     ; bytes of data for each vertex 
@@ -3475,18 +3465,18 @@ $clip_coords_begin_D:
     add       ecx, 4                     ; add header size ==> total packet size
 
     cmp       eax, ecx                   ; fifo space avail >= packet size ?
-    jge       $clip_tri_begin_D          ; yup, start writing triangle data
+    jge       .clip_tri_begin_D          ; yup, start writing triangle data
 
-    push      @Line                      ; line number inside this function
+    push      __LINE__                   ; line number inside this function
     push      0h                         ; pointer to function name = NULL
 
     push      ecx                        ; fifo space needed
-    call      __FifoMakeRoom             ; note: updates fifoPtr
+    call      _FifoMakeRoom             ; note: updates fifoPtr
 
     add       esp, 12                    ; remove 3 DWORD arguments from stack
     nop                                  ; filler
 
-$clip_tri_begin_D:
+.clip_tri_begin_D:
 
     mov       edx, vertexCount           ; number of vertices in triangles
     mov       fifo, [gc + fifoPtr]       ; get fifoPtr
@@ -3500,12 +3490,12 @@ $clip_tri_begin_D:
     test      fifo, ebp                  ; fifoPtr QWORD aligned ?
 
     mov       eax, [gc + wInfo_offset]   ; gc->state.vData.wInfo.offset
-    jz        $clip_fifo_aligned_D       ; yup
+    jz        .clip_fifo_aligned_D       ; yup
 
     mov       [fifo], edx                ; PCI write packet type
     add       fifo, 4                    ; fifo pointer now QWORD aligned
 
-$clip_for_begin_WB0_D :                  ; "write buffer" = MM7 is empty
+.clip_for_begin_WB0_D :                  ; "write buffer" = MM7 is empty
 
     ;; here:  eax = gc->state.vData.wInfo.offset
     ;;        ebx = vertexPtr
@@ -3556,11 +3546,11 @@ $clip_for_begin_WB0_D :                  ; "write buffer" = MM7 is empty
     movd      [atab+4], mm6              ; atable[1] = oow*gc->state.Viewport.hdepth
     pfmul     mm4, mm0                   ; oow * gc->state.tmu_config[1].t_scale | oow * gc->state.tmu_config[1].s_scale
 
-    movq      QWORD PTR [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale 
+    movq      qword [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale 
     movq      [fifo-8], mm7              ; PCI write transformed x, y; write buffer=mm7 empty
 
-    movq      QWORD PTR [atab+20], mm4   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | atable[5] = oow * gc->state.tmu_config[1].s_scale
-    jne       $not_pargb_WB0_D           ; nope, gc->state.vData.colorType != GR_U8
+    movq      qword [atab+20], mm4   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | atable[5] = oow * gc->state.tmu_config[1].s_scale
+    jne       .not_pargb_WB0_D           ; nope, gc->state.vData.colorType != GR_U8
 
     movd      mm7, [edx + eax]           ; get packed ARGB data; 1 DWORD in "write buffer"=mm7
     nop                                  ; filler
@@ -3568,10 +3558,10 @@ $clip_for_begin_WB0_D :                  ; "write buffer" = MM7 is empty
     mov       eax, [gc + tsuDataList + 4]; get offset of next vertex component (after pargb)
     add       dataElem, 4                ; dataElem = 1 (namely pargb)
 
-$not_pargb_WB1_D:
+.not_pargb_WB1_D:
 
     test      eax, eax                   ; end of offset list ?
-    jz        $clip_setup_end_WB1_D      ; yup
+    jz        .clip_setup_end_WB1_D      ; yup
 
     ;; here:  eax = offset into vertex
     ;;        ebx = vertexPtr
@@ -3584,14 +3574,14 @@ $not_pargb_WB1_D:
     ;;        mm7 = write buffer
     ;; avail: eax, esi, mm0, mm3, mm4, mm5, mm6
 
-$inner_loop_WB1_D:
+.inner_loop_WB1_D:
     mov       esi, [gc+dataElem+tsuDatLstScal]; iscaler
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
     movd      mm4, [edx + eax]           ; TRI_SETF(FARRAY(vPtr, i)
-    movd      mm6, atab[esi*4]           ; atable [iscaler]
+    movd      mm6, [atab + esi*4]           ; atable [iscaler]
 
-    movd      mm5, btab[esi*4]           ; btable [iscaler]
+    movd      mm5, [btab + esi*4]           ; btable [iscaler]
     pfmul     mm4, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
 
     mov       eax, [gc+dataElem+tsuDataList+4]; next offset from offset list
@@ -3603,13 +3593,13 @@ $inner_loop_WB1_D:
     mov       esi, [gc+dataElem+tsuDatLstScal+4]; next iscaler
     movq      [fifo-8], mm7              ; PCI write  new param | previous param
 
-    jz        $clip_setup_end_WB0_D      ; yup, end of offset list, this vertex done
+    jz        .clip_setup_end_WB0_D      ; yup, end of offset list, this vertex done
     movd      mm7, [edx + eax]           ; TRI_SETF(FARRAY(vPtr, i)
 
     add       dataElem, 8                ; dataElem += 2
-    movd      mm6, atab[esi*4]           ; atable [iscaler]
+    movd      mm6, [atab + esi*4]           ; atable [iscaler]
 
-    movd      mm5, btab[esi*4]           ; btable [iscaler]
+    movd      mm5, [btab + esi*4]           ; btable [iscaler]
     pfmul     mm7, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
 
     mov       eax, [gc+dataElem+tsuDataList]; next offset from offset list
@@ -3618,27 +3608,27 @@ $inner_loop_WB1_D:
     nop                                  ; filler
     test      eax, eax                   ; offset == 0 (end of offset list) ?
 
-    jnz       $inner_loop_WB1_D          ; nope, get next component (1 DWORD in "write buffer")
+    jnz       .inner_loop_WB1_D          ; nope, get next component (1 DWORD in "write buffer")
     nop                                  ; filler
 
-$clip_setup_end_WB1_D:
+.clip_setup_end_WB1_D:
     mov       vertexCount, [vertices]    ; get back number of vertices left to process
     mov       eax, [gc + wInfo_offset]   ; gc->state.vData.wInfo.offset
 
     dec       vertexCount                ; one less vertex to handle
-    jnz       $clip_for_begin_WB1_D      ; until all vertices done; 1 DWORD in "write buffer" = MM7
+    jnz       .clip_for_begin_WB1_D      ; until all vertices done; 1 DWORD in "write buffer" = MM7
 
     movd      [fifo], mm7                ; flush "write buffer"
     add       fifo, 4                    ; fifoPtr += sizeof(FxFloat)
 
-    jmp       $clip_setup_done_D         ; all vertices handled, triangles done
+    jmp       .clip_setup_done_D         ; all vertices handled, triangles done
     nop                                  ; filler
 
-$clip_fifo_aligned_D:                       
+.clip_fifo_aligned_D:                       
 
     movd      mm7, edx                   ; write buffer has 1 DWORD now
 
-$clip_for_begin_WB1_D:
+.clip_for_begin_WB1_D:
 
     ;; here:  eax = gc->state.vData.wInfo.offset
     ;;        ebx = vertexPtr
@@ -3686,7 +3676,7 @@ $clip_for_begin_WB1_D:
     pfadd     mm5, mm1                   ; TRI_SETF(FARRAY(vPtr, 4)*oow*gc->state.Viewport.hheight + gc->state.Viewport.oy) |
     movd      [atab+4], mm6              ; atable[1] = oow*gc->state.Viewport.hdepth
 
-    movq      QWORD PTR [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale
+    movq      qword [atab+12], mm3   ; atable[4] = oow * gc->state.tmu_config[0].t_scale | atable[3] = oow * gc->state.tmu_config[0].s_scale
     movq      mm3, [gc + tmu1_s_scale]   ; gc->state.tmu_config[1].t_scale | gc->state.tmu_config[1].s_scale
 
     punpckldq mm7, mm5                   ; header | transformed x
@@ -3698,8 +3688,8 @@ $clip_for_begin_WB1_D:
     movq      mm7, mm5                   ; 1 DWORD in "write buffer" (transformed y)
     nop                                  ; filler
 
-    movq      QWORD PTR [atab+20], mm3   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | atable[5] = oow * gc->state.tmu_config[1].s_scale
-    jne       $not_pargb_WB1_D           ; nope, gc->state.vData.colorType != GR_U8
+    movq      qword [atab+20], mm3   ; atable[6] = oow * gc->state.tmu_config[1].t_scale | atable[5] = oow * gc->state.tmu_config[1].s_scale
+    jne       .not_pargb_WB1_D           ; nope, gc->state.vData.colorType != GR_U8
 
     movd      mm6, [edx + eax]           ; get packed ARGB data;
     mov       dataElem, 4                ; dataElem = 1 (namely pargb)
@@ -3710,9 +3700,9 @@ $clip_for_begin_WB1_D:
     movq      [fifo], mm7                ; PCI write pargb | transformed y
     add       fifo, 8                    ; fifoPtr += 2*sizeof(FxFloat)
 
-$not_pargb_WB0_D:
+.not_pargb_WB0_D:
     test      eax, eax                   ; end of data offset list ?
-    jz        $clip_setup_end_WB0_D      ; yup, this vertex done
+    jz        .clip_setup_end_WB0_D      ; yup, this vertex done
 
     ;; here:  eax = offset into vertex
     ;;        ebx = vertexPtr
@@ -3724,12 +3714,12 @@ $not_pargb_WB0_D:
     ;;        mm2 = gc->state.Viewport.hheight|gc->state.Viewport.hwidth
     ;; avail: eax, esi, mm0, mm3, mm4, mm5, mm6, mm7
 
-$inner_loop_WB0_D:
+.inner_loop_WB0_D:
     mov       esi, [gc+dataElem+tsuDatLstScal]; iscaler
     movd      mm7, [edx+eax]             ; TRI_SETF(FARRAY(vPtr, i)
 
-    movd      mm6, atab[esi*4]           ; atable[iscaler]
-    movd      mm5, btab[esi*4]           ; btable[iscaler]
+    movd      mm6, [atab + esi*4]           ; atable[iscaler]
+    movd      mm5, [btab + esi*4]           ; btable[iscaler]
 
     pfmul     mm7, mm6                   ; TRI_SETF(FARRAY(vPtr, i)*atable[iscaler]
     mov       eax, [gc+dataElem+tsuDataList+4]
@@ -3738,12 +3728,12 @@ $inner_loop_WB0_D:
     mov       esi, [gc+dataElem+tsuDatLstScal+4]; next iscaler
 
     test      eax, eax                   ; offset == 0 (end of offset list) ?
-    jz        $clip_setup_end_WB1_D      ; yup, vertex done, 1 DWORD in "write buffer"
+    jz        .clip_setup_end_WB1_D      ; yup, vertex done, 1 DWORD in "write buffer"
 
     movd      mm4, [edx+eax]             ; TRI_SETF(FARRAY(vPtr, i)
-    movd      mm6, atab[esi*4]           ; atable[iscaler]
+    movd      mm6, [atab + esi*4]           ; atable[iscaler]
 
-    movd      mm5, btab[esi*4]           ; btable[iscaler]
+    movd      mm5, [btab + esi*4]           ; btable[iscaler]
     add       dataElem, 8                ; dataElem += 2
 
     add       fifo, 8                    ; fifoPtr += 2*sizeof*FxFloat)
@@ -3756,16 +3746,16 @@ $inner_loop_WB0_D:
     punpckldq mm7, mm4                   ; current component | previous component
 
     movq      [fifo-8], mm7              ; PCI write current component | previous component
-    jnz       $inner_loop_WB0_D          ; nope, vertex not done yet; "write buffer" empty
+    jnz       .inner_loop_WB0_D          ; nope, vertex not done yet; "write buffer" empty
 
-$clip_setup_end_WB0_D:
+.clip_setup_end_WB0_D:
     mov       vertexCount, [vertices]    ; get back number of vertices left to process
     mov       eax, [gc + wInfo_offset]   ; gc->state.vData.wInfo.offset
 
     dec       vertexCount                ; one less vertex to process, any left?
-    jnz       $clip_for_begin_WB0_D      ; yup, vertices left, push out next vertex
+    jnz       .clip_for_begin_WB0_D      ; yup, vertices left, push out next vertex
 
-$clip_setup_done_D:
+.clip_setup_done_D:
     mov       eax, [gc + fifoPtr]        ; old fifoPtr
     mov       ebp, [gc + fifoRoom]       ; old number of bytes available in fifo
 
@@ -3782,13 +3772,13 @@ $clip_setup_done_D:
     mov       [esp + _count], vertexCount; remaining number of vertices to process 
 
     cmp       vertexCount, 0             ; any vertices left to process
-    jg        $clip_coords_begin_D       ; loop if number of vertices to process >= 0
+    jg        .clip_coords_begin_D       ; loop while vertices remain to process (count > 0)
 
     femms                                ; no more MMX code; clear MMX/FPU state
 
-endif ; GLIDE3_SCALER
+%endif ; GLIDE3_SCALER
 
-$tris_done:
+.tris_done:
     pop       ebp                        ; restore frame pointer
     pop       ebx                        ; restore caller's register variable
 
@@ -3796,40 +3786,33 @@ $tris_done:
     pop       edi                        ; restore caller's register variable
 
     ret       12                         ; return, pop 3 DWORD parameters
-__grDrawTriangles_3DNow@12 ENDP        
-
-_TEXT   ENDS
+endp
 
 ;;--------------------------------------------------------------------------
 ;; end AMD3D version
 ;;--------------------------------------------------------------------------
-endif ; GL_AMD3D
+%endif ; GL_AMD3D
 
 ;;--------------------------------------------------------------------------
 ;; start original code
 ;;--------------------------------------------------------------------------
 
-ifndef GL_AMD3D
+%ifndef GL_AMD3D
 
-TITLE   xdraw3.asm
-.586P
 ;;; include listing.inc
-INCLUDE fxgasm.h
+%INCLUDE "fxgasm.h"
 
-.model FLAT,C                   ; Flat memory, mangle publics with leading '_'
+extrn   _GlideRoot
+extrn   _FifoMakeRoom
 
-EXTRN   _GlideRoot:DWORD
-EXTRN   _FifoMakeRoom:NEAR
+segment		CONST
+_F1    DD      1.0
+_F256  DD      256.0
 
-CONST   SEGMENT
-_F1    DD      03f800000r                      ; 1
-_F256  DD      043800000r                      ; 256
+_VPF1    DD      1.0
+_VPF256  DD      256.0
 
-_VPF1    DD      03f800000r                      ; 1
-_VPF256  DD      043800000r                      ; 256    
-CONST   ENDS
-
-_DATA   SEGMENT
+segment		DATA
 vSize           DD    0
 ccoow           DD    0
 packetVal       DD    0
@@ -3851,22 +3834,21 @@ yb              DD    0
 xc              DD    0
 yc              DD    0
 zArea           DD    0
-_DATA    ENDS
 
-_TEXT   SEGMENT
-_pktype = 20
-_type = 24
-_mode = 28
-_count = 32
-_pointers = 36
+segment		TEXT
+%define _pktype 20
+%define _type 24
+%define _mode 28
+%define _count 32
+%define _pointers 36
 
-gc                 TEXTEQU     <esi>       ; points to graphics context
-fifo               TEXTEQU     <ecx>       ; points to next entry in fifo
-dlp                TEXTEQU     <ebp>       ; points to dataList structure
-vertexCount        TEXTEQU     <ebx>       ; Current vertex counter in the packet
-vertexPtr          TEXTEQU     <edi>       ; Current vertex pointer
+%define gc                 esi       ; points to graphics context
+%define fifo               ecx       ; points to next entry in fifo
+%define dlp                ebp       ; points to dataList structure
+%define vertexCount        ebx       ; Current vertex counter in the packet
+%define vertexPtr          edi       ; Current vertex pointer
 
-_drawvertexlist@20 PROC NEAR
+proc _drawvertexlist, 20
 ; 132  : {
 
         push    esi
@@ -3879,24 +3861,24 @@ _drawvertexlist@20 PROC NEAR
 ;;;     if (stride == 0)
 ;;;       stride = gc->state.vData.vStride;
             
-        mov     gc, DWORD PTR _GlideRoot+curGC
-        mov     ecx, DWORD PTR [gc+vertexSize]
+        mov     gc, dword [_GlideRoot+curGC]
+        mov     ecx, dword [gc+vertexSize]
     
-        mov     edx, DWORD PTR [esp+_mode]
-        mov     vertexCount, DWORD PTR [esp+_count]
+        mov     edx, dword [esp+_mode]
+        mov     vertexCount, dword [esp+_count]
     
-        mov     vertexPtr, DWORD PTR [esp+_pointers]
-        mov     DWORD PTR vSize, ecx
+        mov     vertexPtr, dword [esp+_pointers]
+        mov     dword [vSize], ecx
 
         shl     edx, 2
 ;;;     mov     ecx, DWORD PTR [gc+CoordinateSpace]
           test    edx, edx
-        jne     SHORT no_stride
-        mov     edx, DWORD PTR [gc+vertexStride]
+        jne     .no_stride
+        mov     edx, dword [gc+vertexStride]
         shl     edx, 2
 
         align 4
-no_stride:
+.no_stride:
 
 ;;;     Draw the first (or possibly only) set.  This is necessary because
 ;;;     the packet is 3_BDDDDDD, and in the next set, the packet is 3_DDDDDD
@@ -3906,59 +3888,59 @@ no_stride:
 ;;;     if (gc->state.grCoordinateSpaceArgs.coordinate_space_mode == GR_WINDOW_COORDS) {
 
 ;;;     test    ecx, ecx
-          mov     DWORD PTR strideinbytes, edx
+          mov     dword [strideinbytes], edx
 
 ;;;       while (count > 0) {
 ;;;         FxI32 k, vcount = count >= 15 ? 15 : count;
 ;;;         GR_SET_EXPECTED_SIZE(vcount * vSize, 1);
 ;;;         TRI_STRIP_BEGIN(type, vcount, vSize, pktype);
 
-        mov     eax, DWORD PTR [esp+_count]
+        mov     eax, dword [esp+_count]
 ;;;       jne     clip_coordinates
 
         test    eax, eax
-          jle     strip_done
+          jle     .strip_done
 
         align 4
-window_coords_begin:
+.window_coords_begin:
 
         cmp     vertexCount, 15                 ; 0000000fH
-          jl      SHORT win_partial_packet
+          jl      .win_partial_packet
         mov     vertexCount, 15                 ; 0000000fH
 
         align 4
-win_partial_packet:
+.win_partial_packet:
 
-        mov     eax, DWORD PTR vSize
-        mov     ecx, DWORD PTR [gc+fifoRoom]
+        mov     eax, dword [vSize]
+        mov     ecx, dword [gc+fifoRoom]
         imul    eax, vertexCount
         add     eax, 4
         cmp     ecx, eax
-        jge     SHORT win_strip_begin
-        push    @Line
+        jge     .win_strip_begin
+        push    __LINE__
         push    0h
         push    eax
         call    _FifoMakeRoom
         add     esp, 12                 ; 0000000cH
         
         align 4
-win_strip_begin:
+.win_strip_begin:
 
 ;;;     Setup pacet header
 ;;;
-        mov     fifo, DWORD PTR [gc+fifoPtr]
+        mov     fifo, dword [gc+fifoPtr]
           mov     eax, vertexCount
-        mov     edx, DWORD PTR [esp+_type]
-          mov     ebp, DWORD PTR [gc+cullStripHdr]
+        mov     edx, dword [esp+_type]
+          mov     ebp, dword [gc+cullStripHdr]
         shl     edx, 22                 ; 00000010H
           add     fifo, 4
         shl     eax, 6
           or    ebp, edx
         or      eax, ebp
-          mov     edx, DWORD PTR [esp+_pktype]
+          mov     edx, dword [esp+_pktype]
         or      eax, edx
           nop
-        mov     DWORD PTR [fifo-4], eax
+        mov     dword [fifo-4], eax
 
 ;;;     for (k = 0; k < vcount; k++) {
 ;;;       FxI32 i;
@@ -3974,31 +3956,31 @@ win_strip_begin:
 ;;;       i = gc->tsuDataList[dataElem];
 
         align 4
-win_for_begin:
+.win_for_begin:
 
         mov     edx, vertexPtr
-          mov     eax, DWORD PTR strideinbytes
+          mov     eax, dword [strideinbytes]
         cmp     eax, 4
-          jne     SHORT win_no_deref
-        mov     edx, DWORD PTR [vertexPtr]
+          jne     .win_no_deref
+        mov     edx, dword [vertexPtr]
 
         align 4
-win_no_deref:
+.win_no_deref:
 
         add     fifo, 8
           add     vertexPtr, eax
 
-        mov     eax, DWORD PTR [edx]
-          mov     ebp, DWORD PTR [edx+4]
+        mov     eax, dword [edx]
+          mov     ebp, dword [edx+4]
         
-        mov     DWORD PTR [fifo-8], eax
-          mov     eax, DWORD PTR [gc+tsuDataList]
+        mov     dword [fifo-8], eax
+          mov     eax, dword [gc+tsuDataList]
 
-        mov     DWORD PTR [fifo-4], ebp
+        mov     dword [fifo-4], ebp
   
           test    eax, eax
-        lea     dlp, DWORD PTR [gc+tsuDataList]
-          je      SHORT win_datalist_end
+        lea     dlp, [gc+tsuDataList]
+          je      .win_datalist_end
   
         align 4
 
@@ -4008,25 +3990,25 @@ win_no_deref:
 ;;;         i = gc->tsuDataList[dataElem];
 ;;;       }
 
-win_datalist_begin:
+.win_datalist_begin:
 
         add     fifo, 4
           add     dlp, 4
 
-        mov     eax, DWORD PTR [edx+eax]
+        mov     eax, dword [edx+eax]
           nop
 
-        mov     DWORD PTR [fifo-4], eax
-          mov     eax, DWORD PTR [dlp]
+        mov     dword [fifo-4], eax
+          mov     eax, dword [dlp]
 
 
         test    eax, eax
-          jne     SHORT win_datalist_begin
-win_datalist_end:
+          jne     .win_datalist_begin
+.win_datalist_end:
 
         dec     vertexCount
-          jne     SHORT win_for_begin
-win_for_end:
+          jne     .win_for_begin
+.win_for_end:
 
 ;;;       TRI_END;
 ;;;     Prepare for the next packet (if the strip size is longer than 15)
@@ -4035,488 +4017,481 @@ win_for_end:
 ;;;       pktype = SSTCP_PKT3_DDDDDD;
 ;;;     }
   
-        mov     eax, DWORD PTR [gc+fifoPtr]
-          mov     edx, DWORD PTR [gc+fifoRoom]
+        mov     eax, dword [gc+fifoPtr]
+          mov     edx, dword [gc+fifoRoom]
         sub     eax, fifo
-          mov     vertexCount, DWORD PTR [esp+_count]
+          mov     vertexCount, dword [esp+_count]
         add     edx, eax
           sub     vertexCount, 15                 ; 0000000fH
   
-        mov     DWORD PTR [gc+fifoRoom], edx
-          mov     DWORD PTR [esp+_count], vertexCount
+        mov     dword [gc+fifoRoom], edx
+          mov     dword [esp+_count], vertexCount
   
-        mov     DWORD PTR [gc+fifoPtr], fifo
+        mov     dword [gc+fifoPtr], fifo
           test    vertexCount, vertexCount
   
-        mov     DWORD PTR [esp+_pktype], 16 ; 00000010H
-          jg      window_coords_begin
+        mov     dword [esp+_pktype], 16 ; 00000010H
+          jg      .window_coords_begin
 
-strip_done:
+.strip_done:
         pop     ebp
           pop     ebx
         pop     edi
           pop     esi
-        ret     20                      ; 00000014H
+        ret                             ; 20 (14H) arg bytes are declared on the proc line; NOTE(review): confirm ret pops them
 
-_drawvertexlist@20 ENDP
+endp
 
-_pktype = 20
-_type = 24
-_mode = 28
-_count = 32
-_pointers = 36
+%define gc                 esi       ; points to graphics context
+%define fifo               ecx       ; points to next entry in fifo
+%define vertexPtr          edx       ; pointer to vertex or vertex array
 
-gc                 TEXTEQU     <esi>       ; points to graphics context
-fifo               TEXTEQU     <ecx>       ; points to next entry in fifo
-vertexPtr          TEXTEQU     <edx>       ; pointer to vertex or vertex array
-
-_vpdrawvertexlist@20 PROC NEAR
+proc _vpdrawvertexlist, 20
 
         push    esi
         push    edi
 
         push    ebx        
-        mov     gc, DWORD PTR _GlideRoot+curGC
+        mov     gc, dword [_GlideRoot+curGC]
         
         push    ebp
-        mov     ecx, DWORD PTR [esp+_mode]
+        mov     ecx, dword [esp+_mode]
         
-        mov     edi, DWORD PTR [esp+_pointers]
-        mov     eax, DWORD PTR [gc+wInfo_offset]
+        mov     edi, dword [esp+_pointers]
+        mov     eax, dword [gc+wInfo_offset]
     
         test    ecx, ecx
-        je      w_no_dref
+        je      .w_no_dref
     
-        mov     edi, DWORD PTR [edi]
+        mov     edi, dword [edi]
         
         align   4
-w_no_dref:
+.w_no_dref:
         
 ;;;     load first w
         
-        fld     DWORD PTR [edi+eax]
-        fdivr   DWORD PTR _F1
+        fld     dword [edi+eax]
+        fdivr   dword [_F1]
         
-        mov     ecx, DWORD PTR [gc+vertexSize]
-        mov     edx, DWORD PTR [esp+_mode]
+        mov     ecx, dword [gc+vertexSize]
+        mov     edx, dword [esp+_mode]
 
-        mov     edi, DWORD PTR [esp+_count]
+        mov     edi, dword [esp+_count]
 ;;;     mov     vertexArray, DWORD PTR [esp+_pointers]
 
         shl     edx, 2
-        mov     DWORD PTR vSize, ecx
+        mov     dword [vSize], ecx
 
         test    edx, edx
 
-        jne     SHORT no_stride
+        jne     .no_stride
 
-        mov     edx, DWORD PTR [gc+vertexStride]
+        mov     edx, dword [gc+vertexStride]
         shl     edx, 2
 
         align 4
-no_stride:
+.no_stride:
 
-        mov     DWORD PTR strideinbytes, edx
-        mov     eax, DWORD PTR [esp+_type]
+        mov     dword [strideinbytes], edx
+        mov     eax, dword [esp+_type]
 
         shl     eax, 16                 ; 00000010H
-        mov     DWORD PTR packetVal, eax
+        mov     dword [packetVal], eax
 
-clip_coords_begin:
+.clip_coords_begin:
 
         cmp     edi, 15
-        jl      SHORT clip_partial_packet
+        jl      .clip_partial_packet
         mov     edi, 15
-clip_partial_packet:
+.clip_partial_packet:
 
 ;;;     GR_SET_EXPECTED_SIZE(vcount * vSize, 1)
 
-        mov     eax, DWORD PTR vSize
-        mov     ecx, DWORD PTR [gc+fifoRoom]
+        mov     eax, dword [vSize]
+        mov     ecx, dword [gc+fifoRoom]
         
         imul    eax, edi
         add     eax, 4
         cmp     ecx, eax
-        jge     SHORT clip_strip_begin
-        push    @Line
+        jge     .clip_strip_begin
+        push    __LINE__
         push    0h
         push    eax
         call    _FifoMakeRoom
         add     esp, 12                 ; 0000000cH
-clip_strip_begin:
+.clip_strip_begin:
 
 ;;;     TRI_STRIP_BEGIN(type, vcount, vSize, pktype)
 
   
-        mov     fifo, DWORD PTR [gc+fifoPtr]
+        mov     fifo, dword [gc+fifoPtr]
         mov     eax, edi
         
-        mov     edx, DWORD PTR packetVal
-        mov     ebp, DWORD PTR [gc+cullStripHdr]
+        mov     edx, dword [packetVal]
+        mov     ebp, dword [gc+cullStripHdr]
         
         or      eax, edx
         add     fifo, 4
         
         shl     eax, 6
-        mov     edx, DWORD PTR [esp+_pktype]
+        mov     edx, dword [esp+_pktype]
         
         or      eax, ebp
 
         or      eax, edx
-        mov     DWORD PTR [fifo-4], eax
+        mov     dword [fifo-4], eax
 
   
-        mov     vertexPtr, DWORD PTR [esp+_pointers]
-        mov     eax, DWORD PTR [esp+_mode]
+        mov     vertexPtr, dword [esp+_pointers]
+        mov     eax, dword [esp+_mode]
 
         test    eax, eax
         
-        je      SHORT clip_for_begin
-        mov     vertexPtr, DWORD PTR [vertexPtr]
+        je      .clip_for_begin
+        mov     vertexPtr, dword [vertexPtr]
 
         align   4
-clip_for_begin:
+.clip_for_begin:
 
         add     fifo, 8
-        mov     ebp, DWORD PTR strideinbytes
+        mov     ebp, dword [strideinbytes]
 
-        add     DWORD PTR [esp+_pointers], ebp
-        mov     eax, DWORD PTR [gc+paramIndex]
+        add     dword [esp+_pointers], ebp
+        mov     eax, dword [gc+paramIndex]
         
         xor     ebp, ebp
-        mov     ebx, DWORD PTR [gc+tsuDataList]
+        mov     ebx, dword [gc+tsuDataList]
 
 ;;; ;   setup x and y
 
-        fld     DWORD PTR [gc+vp_hwidth]
-        fmul    DWORD PTR [vertexPtr]
+        fld     dword [gc+vp_hwidth]
+        fmul    dword [vertexPtr]
         test    al, 3   
-        fld     DWORD PTR [gc+vp_hheight]
-        fmul    DWORD PTR [vertexPtr+4]
+        fld     dword [gc+vp_hheight]
+        fmul    dword [vertexPtr+4]
         fxch
-        fmul    st, st(2)
+        fmul    st0, st2
         fxch
-        fmul    st, st(2)
+        fmul    st0, st2
         fxch
-        fadd    DWORD PTR [gc+vp_ox]
+        fadd    dword [gc+vp_ox]
         fxch
-        fadd    DWORD PTR [gc+vp_oy]
-        fxch    st(2)
-        fstp    DWORD PTR ccoow
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fadd    dword [gc+vp_oy]
+        fxch	st2
+        fstp    dword [ccoow]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
         
 ;;; ;   set up color
 
-        je      clip_setup_ooz
+        je      .clip_setup_ooz
   
-        cmp     DWORD PTR [gc+colorType], ebp
-        jne     SHORT clip_setup_pargb
+        cmp     dword [gc+colorType], ebp
+        jne     .clip_setup_pargb
   
         test    al, 1
-        je      SHORT clip_setup_a
+        je      .clip_setup_a
         
         add     fifo, 12
         mov     ebp, 3
 
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [ebx+vertexPtr]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [ebx+vertexPtr+4]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [ebx+vertexPtr+8]
-        fxch    st(2)
-        fstp    DWORD PTR [fifo-12]
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
-        mov     ebx, DWORD PTR [gc+tsuDataList+12]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [ebx+vertexPtr]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [ebx+vertexPtr+4]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [ebx+vertexPtr+8]
+        fxch	st2
+        fstp    dword [fifo-12]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
+        mov     ebx, dword [gc+tsuDataList+12]
         
         align 4
-clip_setup_a:
+.clip_setup_a:
   
         test    al, 2
-        je      SHORT clip_setup_ooz
+        je      .clip_setup_ooz
 
         add     fifo, 4
         inc     ebp
 
-        fld     DWORD PTR [ebx+vertexPtr]  
-        fmul    DWORD PTR _GlideRoot+pool_f255
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [ebx+vertexPtr]  
+        fmul    dword [_GlideRoot+pool_f255]
+        fstp    dword [fifo-4]
   
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList]
-        jmp     SHORT clip_setup_ooz
+        mov     ebx, dword [gc+ebp*4+tsuDataList]
+        jmp     .clip_setup_ooz
         
         align 4
-clip_setup_pargb:
+.clip_setup_pargb:
         add     fifo, 4
-        mov     ebx, DWORD PTR [ebx+vertexPtr]
+        mov     ebx, dword [ebx+vertexPtr]
 
-        mov     DWORD PTR [fifo-4], ebx
+        mov     dword [fifo-4], ebx
         nop
         
         mov     ebp, 1  
-        mov     ebx, DWORD PTR [gc+tsuDataList+4]
-clip_setup_ooz:
+        mov     ebx, dword [gc+tsuDataList+4]
+.clip_setup_ooz:
   
         test    al, 4
-        je      SHORT clip_setup_qow
+        je      .clip_setup_qow
 
         add     fifo, 4
         inc     ebp
 
-        fld     DWORD PTR [ebx+vertexPtr]  
-        fmul    DWORD PTR [gc+vp_hdepth]
-        fmul    DWORD PTR ccoow  
-        fadd    DWORD PTR [gc+vp_oz]
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [ebx+vertexPtr]  
+        fmul    dword [gc+vp_hdepth]
+        fmul    dword [ccoow]
+        fadd    dword [gc+vp_oz]
+        fstp    dword [fifo-4]
   
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList]
+        mov     ebx, dword [gc+ebp*4+tsuDataList]
         align 4
-clip_setup_qow:
+.clip_setup_qow:
   
         test    al, 8
-        je      SHORT clip_setup_qow0
+        je      .clip_setup_qow0
   
-        mov     ebx, DWORD PTR [gc+qInfo_mode]
+        mov     ebx, dword [gc+qInfo_mode]
         test    ebx, ebx
-        je      SHORT clip_setup_oow
-        mov     ebx, DWORD PTR [gc+qInfo_offset]
+        je      .clip_setup_oow
+        mov     ebx, dword [gc+qInfo_offset]
 
-        fld     DWORD PTR [vertexPtr+ebx]
-        fmul    DWORD PTR ccoow
-        fstp    DWORD PTR [fifo]
+        fld     dword [vertexPtr+ebx]
+        fmul    dword [ccoow]
+        fstp    dword [fifo]
 
-        jmp     SHORT clip_setup_oow_inc
+        jmp     .clip_setup_oow_inc
         align 4
-clip_setup_oow:
-        mov     ebx, DWORD PTR ccoow
+.clip_setup_oow:
+        mov     ebx, dword [ccoow]
 
-        mov     DWORD PTR [fifo], ebx
+        mov     dword [fifo], ebx
         align 4
-clip_setup_oow_inc:
+.clip_setup_oow_inc:
   
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList+4]
+        mov     ebx, dword [gc+ebp*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebp
         align 4
-clip_setup_qow0:
+.clip_setup_qow0:
   
         test    al, 16
-        je      SHORT clip_setup_stow0
+        je      .clip_setup_stow0
   
-        mov     ebx, DWORD PTR [gc+q0Info_mode]
+        mov     ebx, dword [gc+q0Info_mode]
         cmp     ebx, 1
-        jne     SHORT clip_setup_oow0
+        jne     .clip_setup_oow0
   
-        mov     ebx, DWORD PTR [gc+q0Info_offset]
+        mov     ebx, dword [gc+q0Info_offset]
         
-        fld     DWORD PTR [ebx+vertexPtr]
-        fmul    DWORD PTR ccoow
-        fstp    DWORD PTR [fifo]
+        fld     dword [ebx+vertexPtr]
+        fmul    dword [ccoow]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow0_inc
+        jmp     .clip_setup_oow0_inc
         align 4
-clip_setup_oow0:
-        mov     ebx, DWORD PTR ccoow
+.clip_setup_oow0:
+        mov     ebx, dword [ccoow]
         
-        mov     DWORD PTR [fifo], ebx
+        mov     dword [fifo], ebx
         align 4
-clip_setup_oow0_inc:
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList+4]
+.clip_setup_oow0_inc:
+        mov     ebx, dword [gc+ebp*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebp
         align 4
-clip_setup_stow0:
+.clip_setup_stow0:
   
         test    al, 32
-        je      SHORT clip_setup_qow1
+        je      .clip_setup_qow1
         
 
-        fld     DWORD PTR ccoow
-        fmul    DWORD PTR [ebx+vertexPtr]
+        fld     dword [ccoow]
+        fmul    dword [ebx+vertexPtr]
 
         add     fifo, 8
         add     ebp, 2
 
-        fmul    DWORD PTR [gc+tmu0_s_scale]
-        fld     DWORD PTR ccoow
-        fmul    DWORD PTR [ebx+vertexPtr+4]
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList]
-        fmul    DWORD PTR [gc+tmu0_t_scale]
+        fmul    dword [gc+tmu0_s_scale]
+        fld     dword [ccoow]
+        fmul    dword [ebx+vertexPtr+4]
+        mov     ebx, dword [gc+ebp*4+tsuDataList]
+        fmul    dword [gc+tmu0_t_scale]
         fxch
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
         
         align 4
-clip_setup_qow1:
+.clip_setup_qow1:
 
         test    al, 64
-        je      SHORT clip_setup_stow1
+        je      .clip_setup_stow1
 
-        mov     ebx, DWORD PTR [gc+q1Info_mode]
+        mov     ebx, dword [gc+q1Info_mode]
         cmp     ebx, 1
-        jne     SHORT clip_setup_oow1
+        jne     .clip_setup_oow1
 
-        mov     ebx, DWORD PTR [gc+q1Info_offset]
+        mov     ebx, dword [gc+q1Info_offset]
         
-        fld     DWORD PTR [ebx+vertexPtr]
-        fmul    DWORD PTR ccoow
-        fstp    DWORD PTR [fifo]
+        fld     dword [ebx+vertexPtr]
+        fmul    dword [ccoow]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow1_inc
+        jmp     .clip_setup_oow1_inc
         align 4
-clip_setup_oow1:
-        mov     ebx, DWORD PTR ccoow
+.clip_setup_oow1:
+        mov     ebx, dword [ccoow]
 
-        mov     DWORD PTR [fifo], ebx
+        mov     dword [fifo], ebx
         align 4
-clip_setup_oow1_inc:
+.clip_setup_oow1_inc:
   
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList+4]
+        mov     ebx, dword [gc+ebp*4+tsuDataList+4]
         add     fifo, 4
 
         inc     ebp
 
         align 4
-clip_setup_stow1:
+.clip_setup_stow1:
   
         test    al, 128
-        je      SHORT clip_setup_end
+        je      .clip_setup_end
 
-        fld     DWORD PTR ccoow
-        fmul    DWORD PTR [ebx+vertexPtr]
+        fld     dword [ccoow]
+        fmul    dword [ebx+vertexPtr]
         add     fifo, 8
-        fmul    DWORD PTR [gc+tmu1_s_scale]
-        fld     DWORD PTR ccoow
-        fmul    DWORD PTR [ebx+vertexPtr+4]
-        mov     ebx, DWORD PTR [gc+ebp*4+tsuDataList+4]
-        fmul    DWORD PTR [gc+tmu1_t_scale]
+        fmul    dword [gc+tmu1_s_scale]
+        fld     dword [ccoow]
+        fmul    dword [ebx+vertexPtr+4]
+        mov     ebx, dword [gc+ebp*4+tsuDataList+4]
+        fmul    dword [gc+tmu1_t_scale]
         fxch        
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
 
         align 4
-clip_setup_end:
+.clip_setup_end:
 
         dec     edi        
-        jz      clip_for_end
+        jz      .clip_for_end
 
-        mov     vertexPtr, DWORD PTR [esp+_pointers]
-        mov     ebx, DWORD PTR [esp+_mode]
+        mov     vertexPtr, dword [esp+_pointers]
+        mov     ebx, dword [esp+_mode]
 
         test    ebx, ebx
-        je      SHORT w_clip_no_deref
+        je      .w_clip_no_deref
 
 
-        mov     vertexPtr, DWORD PTR [vertexPtr]
+        mov     vertexPtr, dword [vertexPtr]
         align 4
-w_clip_no_deref:
+.w_clip_no_deref:
 
-        mov     ebx, DWORD PTR [gc+wInfo_offset]
+        mov     ebx, dword [gc+wInfo_offset]
         
-        fld     DWORD PTR [ebx+vertexPtr]  
-        fdivr   DWORD PTR _F1
+        fld     dword [ebx+vertexPtr]  
+        fdivr   dword [_F1]
         
-        jmp     clip_for_begin
+        jmp     .clip_for_begin
         align 4
-clip_for_end:
+.clip_for_end:
   
-        mov     ebx, DWORD PTR [gc+fifoPtr]
-        mov     edx, DWORD PTR [gc+fifoRoom]
+        mov     ebx, dword [gc+fifoPtr]
+        mov     edx, dword [gc+fifoRoom]
         
         sub     ebx, fifo
-        mov     edi, DWORD PTR [esp+_count]
+        mov     edi, dword [esp+_count]
         
         add     edx, ebx
         sub     edi, 15                 ; 0000000fH
   
-        mov     DWORD PTR [gc+fifoRoom], edx
-        mov     DWORD PTR [esp+_count], edi
+        mov     dword [gc+fifoRoom], edx
+        mov     dword [esp+_count], edi
   
-        mov     DWORD PTR [gc+fifoPtr], fifo
-        mov     DWORD PTR [esp+_pktype], 16 ; 00000010H
+        mov     dword [gc+fifoPtr], fifo
+        mov     dword [esp+_pktype], 16 ; 00000010H
 
-        jle     strip_done
-        mov     edx, DWORD PTR [esp+_pointers]
+        jle     .strip_done
+        mov     edx, dword [esp+_pointers]
 
-        mov     ebx, DWORD PTR [esp+_mode]
+        mov     ebx, dword [esp+_mode]
         test    ebx, ebx
         
-        je      SHORT w1_clip_no_deref
-        mov     edx, DWORD PTR [edx]
+        je      .w1_clip_no_deref
+        mov     edx, dword [edx]
         
         align 4
-w1_clip_no_deref:
+.w1_clip_no_deref:
 
-        mov     ebx, DWORD PTR [gc+wInfo_offset]
-        fld     DWORD PTR [ebx+edx]  
-        fdivr   DWORD PTR _F1
+        mov     ebx, dword [gc+wInfo_offset]
+        fld     dword [ebx+edx]  
+        fdivr   dword [_F1]
   
-        jmp     clip_coords_begin
+        jmp     .clip_coords_begin
         align 4
-strip_done:
+.strip_done:
 
         pop     ebp
         pop     ebx
         pop     edi
         pop     esi
-        ret     20                      ; 00000014H
-_vpdrawvertexlist@20 ENDP
+        ret                             ; 20 (14H) arg bytes are declared on the proc line; NOTE(review): confirm ret pops them
+endp
 
-gc                 TEXTEQU     <esi>       ; points to graphics context
-fifo               TEXTEQU     <ecx>       ; points to next entry in fifo
-vertexPtr          TEXTEQU     <edi>       ; Current vertex pointer
+%define gc                 esi       ; points to graphics context
+%define fifo               ecx       ; points to next entry in fifo
+%define vertexPtr          edi       ; Current vertex pointer
 
-    PUBLIC  _vptrisetup_nocull@12
-_vptrisetup_nocull@12 PROC NEAR
-_va = 20
-_vb = 24
-_vc = 28
+proc _vptrisetup_nocull, 12
+_va equ 20
+_vb equ 24
+_vc equ 28
         push    ebx
         push    esi
         
         push    edi
-        mov     gc, DWORD PTR _GlideRoot+curGC
+        mov     gc, dword [_GlideRoot+curGC]
         
-        mov     ecx, DWORD PTR [esp+_va-4]
-        mov     eax, DWORD PTR [gc+wInfo_offset]
+        mov     ecx, dword [esp+_va-4]
+        mov     eax, dword [gc+wInfo_offset]
         
         push    ebp
         nop
         
 ;;; ;   oow[0] = 1.0f / FARRAY(va, gc->state.vData.wInfo.offset)
 
-        fld     DWORD PTR [eax+ecx]
+        fld     dword [eax+ecx]
 
-        fdivr   DWORD PTR _F1
+        fdivr   dword [_F1]
 
-        mov     ecx, DWORD PTR [esp+_vb]
-        mov     ebx, DWORD PTR [esp+_vc]
+        mov     ecx, dword [esp+_vb]
+        mov     ebx, dword [esp+_vc]
 
         nop
         nop
         
-        mov     ebp, DWORD PTR [eax+ecx]
-        mov     edi, DWORD PTR [eax+ebx]
+        mov     ebp, dword [eax+ecx]
+        mov     edi, dword [eax+ebx]
 
-        mov     DWORD PTR vPtr1, ebp
-        mov     DWORD PTR vPtr2, edi
+        mov     dword [vPtr1], ebp
+        mov     dword [vPtr2], edi
         
 ;;; ;   GR_SET_EXPECTED_SIZE(_GlideRoot.curTriSize, 1)
 
-        mov     eax, DWORD PTR _GlideRoot+curTriSize
-        mov     ecx, DWORD PTR [gc+fifoRoom]
+        mov     eax, dword [_GlideRoot+curTriSize]
+        mov     ecx, dword [gc+fifoRoom]
         
         add     eax, 4
         nop
         
         cmp     ecx, eax
-        jge     SHORT setup_pkt_hdr
+        jge     .setup_pkt_hdr
         
-        push    @Line                      ; line number inside this function
+        push    __LINE__                   ; line number inside this function
         push    0h                         ; pointer to function name = NULL
 
         push    eax
@@ -4525,265 +4500,265 @@ _vc = 28
         add     esp, 12                 ; 0000000cH
 
         align 4
-setup_pkt_hdr:  
+.setup_pkt_hdr:
 
 ;;; ;   TRI_STRIP_BEGIN(kSetupStrip, 3, gc->state.vData.vSize, SSTCP_PKT3_BDDBDD)
 
 
-        mov     fifo, DWORD PTR [gc+fifoPtr]
-        mov     eax, DWORD PTR [gc+cullStripHdr]
+        mov     fifo, dword [gc+fifoPtr]
+        mov     eax, dword [gc+cullStripHdr]
 
         add     fifo, 4
-        lea     ebp, DWORD PTR [esp+_va]
+        lea     ebp, [esp+_va]
 
         or      eax, 192                ; 000000c0H
         mov     edx, 0                
 
-        mov     DWORD PTR [fifo-4], eax        
-        mov     vertexPtr, DWORD PTR [ebp]
+        mov     dword [fifo-4], eax        
+        mov     vertexPtr, dword [ebp]
 
-        mov     eax, DWORD PTR [gc+paramIndex]
+        mov     eax, dword [gc+paramIndex]
         nop
 
 ;;; Begin loop
         
         align 4
-begin_for_loop:
+.begin_for_loop:
 
         add     edx, 4
         add     fifo, 8
         
         xor     ebx, ebx
-        mov     ebp, DWORD PTR [gc+tsuDataList]
+        mov     ebp, dword [gc+tsuDataList]
 
 ;;; ;   setup x and y
 
-        fld     DWORD PTR [gc+vp_hwidth]
-        fmul    DWORD PTR [vertexPtr]
+        fld     dword [gc+vp_hwidth]
+        fmul    dword [vertexPtr]
         test    al, 3   
-        fld     DWORD PTR [gc+vp_hheight]
-        fmul    DWORD PTR [vertexPtr+4]
+        fld     dword [gc+vp_hheight]
+        fmul    dword [vertexPtr+4]
         fxch
-        fmul    st, st(2)
+        fmul    st0, st2
         fxch
-        fmul    st, st(2)
+        fmul    st0, st2
         fxch
-        fadd    DWORD PTR [gc+vp_ox]
+        fadd    dword [gc+vp_ox]
         fxch
-        fadd    DWORD PTR [gc+vp_oy]
-        fxch    st(2)
-        fstp    DWORD PTR oowa    
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fadd    dword [gc+vp_oy]
+        fxch	st2
+        fstp    dword [oowa]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
 
                 
 ;;; ;   set up color
 
-        je      clip_setup_ooz
+        je      .clip_setup_ooz
 
-        cmp     DWORD PTR [gc+colorType], ebx
-        jne     SHORT clip_setup_pargb
+        cmp     dword [gc+colorType], ebx
+        jne     .clip_setup_pargb
         
         test    al, 1
-        je      SHORT clip_setup_a
+        je      .clip_setup_a
 
         add     fifo, 12
         add     ebx, 3
 
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp+8]
-        fxch    st(2)
-        fstp    DWORD PTR [fifo-12]
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
-        mov     ebp, DWORD PTR [gc+tsuDataList+12]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp+4]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp+8]
+        fxch	st2
+        fstp    dword [fifo-12]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
+        mov     ebp, dword [gc+tsuDataList+12]
         
         align 4
-clip_setup_a:          
+.clip_setup_a:
 
         test    al, 2
-        je      SHORT clip_setup_ooz
+        je      .clip_setup_ooz
         
         add     fifo, 4
         inc     ebx
         
-        fld     DWORD PTR [vertexPtr+ebp]
-        fmul    DWORD PTR _GlideRoot+pool_f255
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [vertexPtr+ebp]
+        fmul    dword [_GlideRoot+pool_f255]
+        fstp    dword [fifo-4]
         
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
-        jmp     SHORT clip_setup_ooz
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
+        jmp     .clip_setup_ooz
         align 4
-clip_setup_pargb:
+.clip_setup_pargb:
         add     fifo, 4
-        mov     ebx, DWORD PTR [vertexPtr+ebp]
+        mov     ebx, dword [vertexPtr+ebp]
         
-        mov     DWORD PTR [fifo-4], ebx
+        mov     dword [fifo-4], ebx
         nop
         
         mov     ebx, 1
-        mov     ebp, DWORD PTR [gc+tsuDataList+4]
+        mov     ebp, dword [gc+tsuDataList+4]
         align 4
-clip_setup_ooz:
+.clip_setup_ooz:
 
         test    al, 4
-        je      SHORT clip_setup_qow
+        je      .clip_setup_qow
         
         add     fifo, 4
         inc     ebx
         
-        fld     DWORD PTR [vertexPtr+ebp]
-        fmul    DWORD PTR [gc+vp_hdepth]
-        fmul    DWORD PTR oowa
-        fadd    DWORD PTR [gc+vp_oz]
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [vertexPtr+ebp]
+        fmul    dword [gc+vp_hdepth]
+        fmul    dword [oowa]
+        fadd    dword [gc+vp_oz]
+        fstp    dword [fifo-4]
         
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
         align 4
-clip_setup_qow: 
+.clip_setup_qow:
 
         test    al, 8
-        je      SHORT clip_setup_qow0
+        je      .clip_setup_qow0
 
-        cmp     DWORD PTR [gc+qInfo_mode], 1
-        jne     SHORT clip_setup_oow
+        cmp     dword [gc+qInfo_mode], 1
+        jne     .clip_setup_oow
 
-        mov     ebp, DWORD PTR [gc+qInfo_offset]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [ebp+vertexPtr]
-        fstp    DWORD PTR [fifo]
+        mov     ebp, dword [gc+qInfo_offset]
+        fld     dword [oowa]
+        fmul    dword [ebp+vertexPtr]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow_inc
+        jmp     .clip_setup_oow_inc
         align 4
-clip_setup_oow:
+.clip_setup_oow:
         
-        mov     ebp, DWORD PTR oowa
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]   
+.clip_setup_oow_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]   
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_qow0:
+.clip_setup_qow0:
 
         test    al, 16                  ; 00000010H
-        je      SHORT clip_setup_stow0
+        je      .clip_setup_stow0
 
-        cmp     DWORD PTR [gc+q0Info_mode], 1        
-        jne     SHORT clip_setup_oow0
+        cmp     dword [gc+q0Info_mode], 1        
+        jne     .clip_setup_oow0
                         
-        mov     ebp, DWORD PTR [gc+q0Info_offset]
+        mov     ebp, dword [gc+q0Info_offset]
         
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [ebp+vertexPtr]
-        fstp    DWORD PTR [fifo]
+        fld     dword [oowa]
+        fmul    dword [ebp+vertexPtr]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow0_inc
+        jmp     .clip_setup_oow0_inc
         align 4
-clip_setup_oow0:
-        mov     ebp, DWORD PTR oowa
+.clip_setup_oow0:
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow0_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]
+.clip_setup_oow0_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_stow0:
+.clip_setup_stow0:
                         
         test    al, 32
-        je      SHORT clip_setup_qow1
+        je      .clip_setup_qow1
 
         
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp]
 
         add     fifo, 8
         add     ebx, 2
 
-        fmul    DWORD PTR [gc+tmu0_s_scale]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
-        fmul    DWORD PTR [gc+tmu0_t_scale]
+        fmul    dword [gc+tmu0_s_scale]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp+4]
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
+        fmul    dword [gc+tmu0_t_scale]
         fxch    
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
         
         align 4
-clip_setup_qow1:
+.clip_setup_qow1:
 
         test    al, 64
-        je      SHORT clip_setup_stow1
+        je      .clip_setup_stow1
 
-        cmp     DWORD PTR [gc+q1Info_mode], 1
-        jne     SHORT clip_setup_oow1
+        cmp     dword [gc+q1Info_mode], 1
+        jne     .clip_setup_oow1
 
-        mov     ebp, DWORD PTR [gc+q1Info_offset]
+        mov     ebp, dword [gc+q1Info_offset]
         
-        fld     DWORD PTR [ebp+vertexPtr]
-        fmul    DWORD PTR oowa
-        fstp    DWORD PTR [fifo]
+        fld     dword [ebp+vertexPtr]
+        fmul    dword [oowa]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow1_inc
+        jmp     .clip_setup_oow1_inc
         align 4
-clip_setup_oow1:
-        mov     ebp, DWORD PTR oowa
+.clip_setup_oow1:
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow1_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]
+.clip_setup_oow1_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_stow1:
+.clip_setup_stow1:
 
         test    al, 128
-        je      SHORT clip_setup_end
+        je      .clip_setup_end
         
 
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp]
         add     fifo, 8  
-        fmul    DWORD PTR [gc+tmu1_s_scale]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        fmul    DWORD PTR [gc+tmu1_t_scale]
+        fmul    dword [gc+tmu1_s_scale]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp+4]
+        fmul    dword [gc+tmu1_t_scale]
         fxch    
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
 
         align 4
-clip_setup_end:
+.clip_setup_end:
 
         cmp     edx, 12
-        je      update_fifo_ptr
+        je      .update_fifo_ptr
         
-        fld     DWORD PTR vPtr0[edx]
-        fdivr   DWORD PTR _F1
+        fld     dword [vPtr0+edx]
+        fdivr   dword [_F1]
         
-        lea     ebx, DWORD PTR [esp+_va]
-        mov     ebp, DWORD PTR [gc+wInfo_offset]
+        lea     ebx, [esp+_va]
+        mov     ebp, dword [gc+wInfo_offset]
         
-        mov     vertexPtr, DWORD PTR [ebx+edx]
-        jmp     begin_for_loop
+        mov     vertexPtr, dword [ebx+edx]
+        jmp     .begin_for_loop
        
         align 4
-update_fifo_ptr:        
+.update_fifo_ptr:
 
-        mov     ebx, DWORD PTR [gc+fifoPtr]
-        mov     edx, DWORD PTR [gc+fifoRoom]
+        mov     ebx, dword [gc+fifoPtr]
+        mov     edx, dword [gc+fifoRoom]
         
         sub     ebx, fifo
         mov     eax, 1
@@ -4791,11 +4766,11 @@ update_fifo_ptr:
         add     edx, ebx
         pop     ebp
 
-        mov     DWORD PTR [gc+fifoRoom], edx
+        mov     dword [gc+fifoRoom], edx
         pop     edi
 
-        mov     DWORD PTR [gc+fifoPtr], fifo
-        mov     ebx, DWORD PTR _GlideRoot+trisProcessed
+        mov     dword [gc+fifoPtr], fifo
+        mov     ebx, dword [_GlideRoot+trisProcessed]
 
 ;;; ;   _GlideRoot.stats.trisProcessed++
 
@@ -4803,63 +4778,59 @@ update_fifo_ptr:
         pop     esi
         inc     ebx
 
-        mov     DWORD PTR _GlideRoot+trisProcessed, ebx
+        mov     dword [_GlideRoot+trisProcessed], ebx
         pop     ebx
         
-        ret     12                      ; 0000000cH
+        ret                             ; 0000000cH
         
-_vptrisetup_nocull@12 ENDP
+endp
 
-    PUBLIC  _vptrisetup_cull@12
-_vptrisetup_cull@12 PROC NEAR
+proc _vptrisetup_cull, 12
         
-_va = 20
-_vb = 24
-_vc = 28
         push    ebx
         push    esi
         
         push    edi
-        mov     gc, DWORD PTR _GlideRoot+curGC
+        mov     gc, dword [_GlideRoot+curGC]
         
-        mov     ecx, DWORD PTR [esp+_va-4]
+        mov     ecx, dword [esp+_va-4]
         push    ebp
 
-        mov     eax, DWORD PTR [gc+wInfo_offset]
+        mov     eax, dword [gc+wInfo_offset]
         nop
         
 ;;; ;   oow[0] = 1.0f / FARRAY(va, gc->state.vData.wInfo.offset)
 
-        fld     DWORD PTR [eax+ecx]
+        fld     dword [eax+ecx]
 
-        fdivr   DWORD PTR _F1
+        fdivr   dword [_F1]
 
-        mov     DWORD PTR vPtr0, ecx
+        mov     dword [vPtr0], ecx
 
-        mov     ecx, DWORD PTR [esp+_vb]
-        mov     ebx, DWORD PTR [esp+_vc]
+        mov     ecx, dword [esp+_vb]
+        mov     ebx, dword [esp+_vc]
 
-        fld     DWORD PTR [eax+ecx]
-        fdivr   DWORD PTR _F1
+        fld     dword [eax+ecx]
+        fdivr   dword [_F1]
 
-        mov     DWORD PTR vPtr1, ecx
-        mov     DWORD PTR vPtr2, ebx
+        mov     dword [vPtr1], ecx
+        mov     dword [vPtr2], ebx
 
-        fld     DWORD PTR [eax+ebx]
-        fdivr   DWORD PTR _F1
-        fxch    st(2)                           ;  oow2  oow1  oow0
+        fld     dword [eax+ebx]
+        fdivr   dword [_F1]
+        fxch    st2                           ;  oow2  oow1  oow0
 
 ;;; ;   GR_SET_EXPECTED_SIZE(_GlideRoot.curTriSize, 1)
-        mov     eax, DWORD PTR _GlideRoot+curTriSize
-        mov     ecx, DWORD PTR [gc+fifoRoom]
+        mov     eax, dword [_GlideRoot+curTriSize]
+        mov     ecx, dword [gc+fifoRoom]
         
         add     eax, 4
         nop
         
         cmp     ecx, eax
-        jge     SHORT setup_pkt_hdr
+        jge     .setup_pkt_hdr
         
-        push    @Line                      ; line number inside this function
+        push    __LINE__                   ; line number inside this function
         push    0h                         ; pointer to function name = NULL
 
         push    eax
@@ -4868,102 +4839,102 @@ _vc = 28
         add     esp, 12                 ; 0000000cH
 
         align 4
-setup_pkt_hdr: 
+.setup_pkt_hdr:
 
         mov     edx, [gc + cull_mode]    
         shl     edx, 31                    ; culltest << 31    
 
 ;;; ;   culling
 
-        mov     vertexPtr, DWORD PTR [esp+_va]  ;
-        fld     DWORD PTR [gc+vp_hwidth]        ;  oow2  oow1  oow0  tx
-        fmul    DWORD PTR [vertexPtr]           ;  |     |     |     |
-        fld     DWORD PTR [gc+vp_hheight]       ;  |     |     |     |     ty
-        fmul    DWORD PTR [vertexPtr+4]         ;  |     |     |     |     |
-        fxch                                    ;  |     |     |     ty    tx
-        fmul    st, st(2)                       ;  |     |     |     |     |
-        fxch                                    ;  |     |     |     tx    ty
-        fmul    st, st(2)                       ;  |     |     |     |     |
-        fxch                                    ;  |     |     |     ty    tx
-        fadd    DWORD PTR [gc+vp_ox]            ;  |     |     |     |     xa
-        fxch                                    ;  |     |     |     xa    ty
-        fadd    DWORD PTR [gc+vp_oy]            ;  |     |     |     |     ya
-        fxch    st(2)                           ;  |     |     ya    |     oow0
-        fstp    DWORD PTR oow0                  ;  |     |     |     |
-        fstp    DWORD PTR xa                    ;  |     |     |
-        fstp    DWORD PTR ya                    ;  |     |
+        mov     vertexPtr, dword [esp+_va]  ;
+        fld     dword [gc+vp_hwidth]        ;  oow2  oow1  oow0  tx
+        fmul    dword [vertexPtr]           ;  |     |     |     |
+        fld     dword [gc+vp_hheight]       ;  |     |     |     |     ty
+        fmul    dword [vertexPtr+4]         ;  |     |     |     |     |
+        fxch                                ;  |     |     |     ty    tx
+        fmul    st0, st2                    ;  |     |     |     |     |
+        fxch                                ;  |     |     |     tx    ty
+        fmul    st0, st2                    ;  |     |     |     |     |
+        fxch                                ;  |     |     |     ty    tx
+        fadd    dword [gc+vp_ox]            ;  |     |     |     |     xa
+        fxch                                ;  |     |     |     xa    ty
+        fadd    dword [gc+vp_oy]            ;  |     |     |     |     ya
+        fxch    st2                         ;  |     |     ya    |     oow0
+        fstp    dword [oow0]                ;  |     |     |     |
+        fstp    dword [xa]                  ;  |     |     |
+        fstp    dword [ya]                  ;  |     |
                 
-        mov     vertexPtr, DWORD PTR [esp+_vb]  ;
-        fld     DWORD PTR [gc+vp_hwidth]        ;  oow2  oow1  tx
-        fmul    DWORD PTR [vertexPtr]           ;  |     |     |
-        fld     DWORD PTR [gc+vp_hheight]       ;  |     |     |     ty
-        fmul    DWORD PTR [vertexPtr+4]         ;  |     |     |     |
-        fxch                                    ;  |     |     ty    tx
-        fmul    st, st(2)                       ;  |     |     |     |
-        fxch                                    ;  |     |     tx    ty
-        fmul    st, st(2)                       ;  |     |     |     |
-        fxch                                    ;  |     |     ty    tx
-        fadd    DWORD PTR [gc+vp_ox]            ;  |     |     |     xb
-        fxch                                    ;  |     |     xb    ty
-        fadd    DWORD PTR [gc+vp_oy]            ;  |     |     |     yb
-        fxch    st(2)                           ;  |     yb    |     oow1
-        fstp    DWORD PTR oow1                  ;  |     |     |
-        fstp    DWORD PTR xb                    ;  |     |
-        fstp    DWORD PTR yb                    ;  |
+        mov     vertexPtr, dword [esp+_vb]  ;
+        fld     dword [gc+vp_hwidth]        ;  oow2  oow1  tx
+        fmul    dword [vertexPtr]           ;  |     |     |
+        fld     dword [gc+vp_hheight]       ;  |     |     |     ty
+        fmul    dword [vertexPtr+4]         ;  |     |     |     |
+        fxch                                ;  |     |     ty    tx
+        fmul    st0, st2                    ;  |     |     |     |
+        fxch                                ;  |     |     tx    ty
+        fmul    st0, st2                    ;  |     |     |     |
+        fxch                                ;  |     |     ty    tx
+        fadd    dword [gc+vp_ox]            ;  |     |     |     xb
+        fxch                                ;  |     |     xb    ty
+        fadd    dword [gc+vp_oy]            ;  |     |     |     yb
+        fxch    st2                         ;  |     yb    |     oow1
+        fstp    dword [oow1]                ;  |     |     |
+        fstp    dword [xb]                  ;  |     |
+        fstp    dword [yb]                  ;  |
 
-        mov     vertexPtr, DWORD PTR [esp+_vc]  ;
-        fld     DWORD PTR [gc+vp_hwidth]        ;  |     tx
-        fmul    DWORD PTR [vertexPtr]           ;  |     |
-        fld     DWORD PTR [gc+vp_hheight]       ;  |     |     ty
-        fmul    DWORD PTR [vertexPtr+4]         ;  |     |     |
-        fxch                                    ;  |     ty    tx
-        fmul    st, st(2)                       ;  |     |     |
-        fxch                                    ;  |     tx    ty
-        fmul    st, st(2)                       ;  |     |     |
-        fxch                                    ;  |     ty    tx
-        fadd    DWORD PTR [gc+vp_ox]            ;  |     |     xc
-        fxch                                    ;  |     xc    ty
-        fadd    DWORD PTR [gc+vp_oy]            ;  |     |     yc
-        fxch    st(2)                           ;  yc    |     oow2
-        fstp    DWORD PTR oow1                  ;  |     |
-        fstp    DWORD PTR xc                    ;  |
-        fstp    DWORD PTR yc                    ;
+        mov     vertexPtr, dword [esp+_vc]  ;  vertex c; FPU stack on entry: oow2
+        fld     dword [gc+vp_hwidth]        ;  |     tx
+        fmul    dword [vertexPtr]           ;  |     |
+        fld     dword [gc+vp_hheight]       ;  |     |     ty
+        fmul    dword [vertexPtr+4]         ;  |     |     |
+        fxch                                ;  |     ty    tx
+        fmul    st0, st2                    ;  |     |     |
+        fxch                                ;  |     tx    ty
+        fmul    st0, st2                    ;  |     |     |
+        fxch                                ;  |     ty    tx
+        fadd    dword [gc+vp_ox]            ;  |     |     xc
+        fxch                                ;  |     xc    ty
+        fadd    dword [gc+vp_oy]            ;  |     |     yc
+        fxch    st2                         ;  yc    |     oow2
+        fstp    dword [oow0+8]              ;  |     |     oow2 slot: loop reads [oow0+edx], edx=8 (was [oow1], clobbering vertex b)
+        fstp    dword [xc]                  ;  |
+        fstp    dword [yc]                  ;
 
-        fld     DWORD PTR xa                    ;  xa
-        fsub    DWORD PTR xb                    ;  dxAB
-        fld     DWORD PTR yb                    ;  |    yb
-        fsub    DWORD PTR yc                    ;  |    dyBC
-        fld     DWORD PTR xb                    ;  |    |    xb
-        fsub    DWORD PTR xc                    ;  |    |    dxBC
-        fxch                                    ;  |    dxBC dyBC
-        fld     DWORD PTR ya                    ;  |    |    |    ya
-        fsub    DWORD PTR yb                    ;  |    |    |    dyAB
-        fxch    st(3)                           ;  dyAB |    |    dxAB
-        fmulp   st(1), st                       ;  |    |    t0=dxAB*dyBC
-        fxch    st(2)                           ;  t0   |    dxAB
-        fmulp   st(1), st                       ;  t0   t1
-        fsubp   st(1),st                        ;  area
-        fstp    DWORD PTR zArea
+        fld     dword [xa]                  ;  xa
+        fsub    dword [xb]                  ;  dxAB
+        fld     dword [yb]                  ;  |    yb
+        fsub    dword [yc]                  ;  |    dyBC
+        fld     dword [xb]                  ;  |    |    xb
+        fsub    dword [xc]                  ;  |    |    dxBC
+        fxch                                ;  |    dxBC dyBC
+        fld     dword [ya]                  ;  |    |    |    ya
+        fsub    dword [yb]                  ;  |    |    |    dyAB
+        fxch    st3                         ;  dyAB |    |    dxAB
+        fmulp   st1, st0                    ;  |    |    t0=dxAB*dyBC
+        fxch    st2                         ;  t0   |    dxAB
+        fmulp   st1, st0                    ;  t0   t1
+        fsubp   st1,st0                     ;  area
+        fstp    dword [zArea]
         
-        mov     ecx, DWORD PTR zArea        ; j = *(long *)&area
+        mov     ecx, dword [zArea]    ; j = *(long *)&area
         xor     eax, eax              ; Clear the return value (0 == culled)
 
         ; Zero Area Triangle Check
         and     ecx, 7fffffffh    ; if ((j & 0x7FFFFFFF) == 0)
-        jz      __triDone
+        jz      .__triDone
 
         ;; Triangle area check vs culling mode
-        mov     ecx, DWORD PTR zArea              ; reload area just in case we're culling
+        mov     ecx, dword [zArea]              ; reload area just in case we're culling
         xor     ecx, edx               ; if (j ^ (culltest << 31))
     
-        jge     __triDone
+        jge     .__triDone
 
 
 ;;; ;   TRI_STRIP_BEGIN(kSetupStrip, 3, gc->state.vData.vSize, SSTCP_PKT3_BDDBDD)
 
 
-        mov     fifo, DWORD PTR [gc+fifoPtr]
-        mov     eax, DWORD PTR [gc+cullStripHdr]
+        mov     fifo, dword [gc+fifoPtr]
+        mov     eax, dword [gc+cullStripHdr]
 
         add     fifo, 4
         mov     edx, 0                
@@ -4971,233 +4942,233 @@ setup_pkt_hdr:
         or      eax, 192                ; 000000c0H
         nop
 
-        mov     DWORD PTR [fifo-4], eax        
-        mov     eax, DWORD PTR [gc+paramIndex]
+        mov     dword [fifo-4], eax        
+        mov     eax, dword [gc+paramIndex]
 
 
 ;;; Begin loop
         
         align 4
-begin_for_loop:
-        mov     ebp, DWORD PTR oow0[edx]
+.begin_for_loop:
+        mov     ebp, dword [oow0+edx]
         add     fifo, 8        
 
-        mov     vertexPtr, DWORD PTR vPtr0[edx]
-        mov     DWORD PTR oowa, ebp
+        mov     vertexPtr, dword [vPtr0+edx]
+        mov     dword [oowa], ebp
 
 
 ;;; ;   setup x and y
 
-        mov     ebx, DWORD PTR xa[edx*2]
-        mov     ebp, DWORD PTR xa[edx*2+4]
+        mov     ebx, dword [xa+edx*2]
+        mov     ebp, dword [xa+edx*2+4]
 
-        mov     DWORD PTR [fifo-8], ebx        
+        mov     dword [fifo-8], ebx        
         add     edx, 4
 
-        mov     DWORD PTR [fifo-4], ebp
+        mov     dword [fifo-4], ebp
         xor     ebx, ebx
 
         
-        mov     ebp, DWORD PTR [gc+tsuDataList]
+        mov     ebp, dword [gc+tsuDataList]
         test    al, 3
         
 ;;; ;   set up color
 
-        je      clip_setup_ooz
+        je      .clip_setup_ooz
 
-        cmp     DWORD PTR [gc+colorType], ebx
-        jne     SHORT clip_setup_pargb
+        cmp     dword [gc+colorType], ebx
+        jne     .clip_setup_pargb
         
         test    al, 1
-        je      SHORT clip_setup_a
+        je      .clip_setup_a
 
         add     fifo, 12
         add     ebx, 3
 
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        fld     DWORD PTR _GlideRoot+pool_f255
-        fmul    DWORD PTR [vertexPtr+ebp+8]
-        fxch    st(2)
-        fstp    DWORD PTR [fifo-12]
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
-        mov     ebp, DWORD PTR [gc+tsuDataList+12]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp+4]
+        fld     dword [_GlideRoot+pool_f255]
+        fmul    dword [vertexPtr+ebp+8]
+        fxch	st2
+        fstp    dword [fifo-12]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
+        mov     ebp, dword [gc+tsuDataList+12]
         
         align 4
-clip_setup_a:          
+.clip_setup_a:
 
         test    al, 2
-        je      SHORT clip_setup_ooz
+        je      .clip_setup_ooz
         
         add     fifo, 4
         inc     ebx
         
-        fld     DWORD PTR [vertexPtr+ebp]
-        fmul    DWORD PTR _GlideRoot+pool_f255
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [vertexPtr+ebp]
+        fmul    dword [_GlideRoot+pool_f255]
+        fstp    dword [fifo-4]
         
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
-        jmp     SHORT clip_setup_ooz
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
+        jmp     .clip_setup_ooz
         align 4
-clip_setup_pargb:
+.clip_setup_pargb:
         add     fifo, 4
-        mov     ebx, DWORD PTR [vertexPtr+ebp]
+        mov     ebx, dword [vertexPtr+ebp]
         
-        mov     DWORD PTR [fifo-4], ebx
+        mov     dword [fifo-4], ebx
         nop
         
         mov     ebx, 1
-        mov     ebp, DWORD PTR [gc+tsuDataList+4]
+        mov     ebp, dword [gc+tsuDataList+4]
         align 4
-clip_setup_ooz:
+.clip_setup_ooz:
 
         test    al, 4
-        je      SHORT clip_setup_qow
+        je      .clip_setup_qow
         
         add     fifo, 4
         inc     ebx
         
-        fld     DWORD PTR [vertexPtr+ebp]
-        fmul    DWORD PTR [gc+vp_hdepth]
-        fmul    DWORD PTR oowa
-        fadd    DWORD PTR [gc+vp_oz]
-        fstp    DWORD PTR [fifo-4]
+        fld     dword [vertexPtr+ebp]
+        fmul    dword [gc+vp_hdepth]
+        fmul    dword [oowa]
+        fadd    dword [gc+vp_oz]
+        fstp    dword [fifo-4]
         
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
         align 4
-clip_setup_qow: 
+.clip_setup_qow:
 
         test    al, 8
-        je      SHORT clip_setup_qow0
+        je      .clip_setup_qow0
 
-        cmp     DWORD PTR [gc+qInfo_mode], 1
-        jne     SHORT clip_setup_oow
+        cmp     dword [gc+qInfo_mode], 1
+        jne     .clip_setup_oow
 
-        mov     ebp, DWORD PTR [gc+qInfo_offset]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [ebp+vertexPtr]
-        fstp    DWORD PTR [fifo]
+        mov     ebp, dword [gc+qInfo_offset]
+        fld     dword [oowa]
+        fmul    dword [ebp+vertexPtr]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow_inc
+        jmp     .clip_setup_oow_inc
         align 4
-clip_setup_oow:
+.clip_setup_oow:
         
-        mov     ebp, DWORD PTR oowa
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]   
+.clip_setup_oow_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]   
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_qow0:
+.clip_setup_qow0:
 
         test    al, 16                  ; 00000010H
-        je      SHORT clip_setup_stow0
+        je      .clip_setup_stow0
 
-        cmp     DWORD PTR [gc+q0Info_mode], 1        
-        jne     SHORT clip_setup_oow0
+        cmp     dword [gc+q0Info_mode], 1        
+        jne     .clip_setup_oow0
                         
-        mov     ebp, DWORD PTR [gc+q0Info_offset]
+        mov     ebp, dword [gc+q0Info_offset]
         
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [ebp+vertexPtr]
-        fstp    DWORD PTR [fifo]
+        fld     dword [oowa]
+        fmul    dword [ebp+vertexPtr]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow0_inc
+        jmp     .clip_setup_oow0_inc
         align 4
-clip_setup_oow0:
-        mov     ebp, DWORD PTR oowa
+.clip_setup_oow0:
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow0_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]
+.clip_setup_oow0_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_stow0:
+.clip_setup_stow0:
                         
         test    al, 32
-        je      SHORT clip_setup_qow1
+        je      .clip_setup_qow1
 
         
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp]
 
         add     fifo, 8
         add     ebx, 2
 
-        fmul    DWORD PTR [gc+tmu0_s_scale]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList]
-        fmul    DWORD PTR [gc+tmu0_t_scale]
+        fmul    dword [gc+tmu0_s_scale]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp+4]
+        mov     ebp, dword [gc+ebx*4+tsuDataList]
+        fmul    dword [gc+tmu0_t_scale]
         fxch    
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
         
         align 4
-clip_setup_qow1:
+.clip_setup_qow1:
 
         test    al, 64
-        je      SHORT clip_setup_stow1
+        je      .clip_setup_stow1
 
-        cmp     DWORD PTR [gc+q1Info_mode], 1
-        jne     SHORT clip_setup_oow1
+        cmp     dword [gc+q1Info_mode], 1
+        jne     .clip_setup_oow1
 
-        mov     ebp, DWORD PTR [gc+q1Info_offset]
+        mov     ebp, dword [gc+q1Info_offset]
         
-        fld     DWORD PTR [ebp+vertexPtr]
-        fmul    DWORD PTR oowa
-        fstp    DWORD PTR [fifo]
+        fld     dword [ebp+vertexPtr]
+        fmul    dword [oowa]
+        fstp    dword [fifo]
         
-        jmp     SHORT clip_setup_oow1_inc
+        jmp     .clip_setup_oow1_inc
         align 4
-clip_setup_oow1:
-        mov     ebp, DWORD PTR oowa
+.clip_setup_oow1:
+        mov     ebp, dword [oowa]
         
-        mov     DWORD PTR [fifo], ebp
+        mov     dword [fifo], ebp
         align 4
-clip_setup_oow1_inc:
-        mov     ebp, DWORD PTR [gc+ebx*4+tsuDataList+4]
+.clip_setup_oow1_inc:
+        mov     ebp, dword [gc+ebx*4+tsuDataList+4]
         add     fifo, 4
         
         inc     ebx
         align 4
-clip_setup_stow1:
+.clip_setup_stow1:
 
         test    al, 128
-        je      SHORT clip_setup_end
+        je      .clip_setup_end
         
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp]
         add     fifo, 8  
-        fmul    DWORD PTR [gc+tmu1_s_scale]
-        fld     DWORD PTR oowa
-        fmul    DWORD PTR [vertexPtr+ebp+4]
-        fmul    DWORD PTR [gc+tmu1_t_scale]
+        fmul    dword [gc+tmu1_s_scale]
+        fld     dword [oowa]
+        fmul    dword [vertexPtr+ebp+4]
+        fmul    dword [gc+tmu1_t_scale]
         fxch    
-        fstp    DWORD PTR [fifo-8]
-        fstp    DWORD PTR [fifo-4]
+        fstp    dword [fifo-8]
+        fstp    dword [fifo-4]
 
         align 4
-clip_setup_end:
+.clip_setup_end:
 
         cmp     edx, 12
-        jl      begin_for_loop
+        jl      .begin_for_loop
         
         align 4
-update_fifo_ptr:        
+.update_fifo_ptr:
 
-        mov     ebx, DWORD PTR [gc+fifoPtr]
-        mov     edx, DWORD PTR [gc+fifoRoom]
+        mov     ebx, dword [gc+fifoPtr]
+        mov     edx, dword [gc+fifoRoom]
         
         sub     ebx, fifo
         mov     eax, 1
@@ -5205,11 +5176,11 @@ update_fifo_ptr:
         add     edx, ebx
         pop     ebp
 
-        mov     DWORD PTR [gc+fifoRoom], edx
+        mov     dword [gc+fifoRoom], edx
         pop     edi
 
-        mov     DWORD PTR [gc+fifoPtr], fifo
-        mov     ebx, DWORD PTR _GlideRoot+trisProcessed
+        mov     dword [gc+fifoPtr], fifo
+        mov     ebx, dword [_GlideRoot+trisProcessed]
 
 ;;; ;   _GlideRoot.stats.trisProcessed++
 
@@ -5217,12 +5188,12 @@ update_fifo_ptr:
         pop     esi
         inc     ebx
 
-        mov     DWORD PTR _GlideRoot+trisProcessed, ebx
+        mov     dword [_GlideRoot+trisProcessed], ebx
         pop     ebx
         
-        ret     12                      ; 0000000cH
+        ret                             ; 0000000cH
         align 4
-__triDone:    
+.__triDone:
         ;; Restore trashed registers
         mov     esi, [_GlideRoot + trisProcessed]
         pop     ebp
@@ -5234,12 +5205,8 @@ __triDone:
         mov     [_GlideRoot + trisProcessed], esi
         
         pop     ebx
-        ret     12
+        ret
 
-_vptrisetup_cull@12 ENDP
-
-_TEXT   ENDS
-endif  ; !GL_AMD3D
-
-END
+endp
 
+%endif  ; !GL_AMD3D
diff --git a/glide3x/cvg/glide3/src/xos.inc b/glide3x/cvg/glide3/src/xos.inc
new file mode 100644
index 0000000..f4964bd
--- /dev/null
+++ b/glide3x/cvg/glide3/src/xos.inc
@@ -0,0 +1,135 @@
+;
+; compulsory header for cvg/glide3/xdraw* assembly specializations (NASM)
+;
+; $Header$
+; $Log$
+; Revision 1.1.2.2  2003/06/13 07:22:59  dborca
+; more fixes to NASM sources
+;
+; Revision 1.1.2.1  2003/06/07 09:53:25  dborca
+; initial checkin for NASM sources
+;
+
+;---------------------------------------
+; platform defines
+;---------------------------------------
+%define XOS_DJGPP 1
+%define XOS_LINUX 2
+%define XOS_WIN32 4
+
+%define STDCALL   0				; default; redefined to 1 below when __WIN32__ is defined
+
+;---------------------------------------
+; pick up the right OS (first defined of __DJGPP__, __linux__, __WIN32__)
+;---------------------------------------
+%ifdef __DJGPP__
+%define XOS XOS_DJGPP
+%elifdef __linux__
+%define XOS XOS_LINUX
+%elifdef __WIN32__
+%define XOS XOS_WIN32
+%define STDCALL 1				; turns on NAME@BYTES decoration and RET BYTES in the macros below
+%else
+%error Unknown OS
+%endif
+
+;---------------------------------------
+; general purpose macros
+;---------------------------------------
+%macro	extrn	1-2 0				; extrn NAME [, ARGBYTES]: declare NAME as an external symbol
+	%if (XOS == XOS_WIN32) && STDCALL && (%2 > 0)
+		%define %1 %1@%2		; Win32 stdcall with ARGBYTES > 0: NAME now expands to NAME@ARGBYTES
+	%endif
+		extern	%1			; NAME is expanded here, so the decorated form is what gets declared
+%endmacro
+
+%macro	globl	1-2 0				; globl NAME [, ARGBYTES]: export NAME as a global symbol
+	%if (XOS == XOS_WIN32) && STDCALL && (%2 > 0)
+		%define %1 %1@%2		; Win32 stdcall with ARGBYTES > 0: NAME now expands to NAME@ARGBYTES
+	%endif
+		global	%1			; NAME is expanded here, so the decorated form is what gets exported
+%endmacro
+
+%macro		proc	1-2 0			; proc NAME [, ARGBYTES]: open an exported procedure; close it with endp
+	%push	proc				; context named proc: the ret and endp macros test for it
+	%if STDCALL && (%2 > 0)
+		%define	%$ret	RET %2		; stdcall: the return also pops ARGBYTES of arguments
+	%else
+		%define	%$ret	RET		; otherwise a plain return
+	%endif
+	globl	%1, %2				; export NAME (decorated by globl on Win32 stdcall)
+%1:
+%endmacro
+
+%macro		endp	0			; endp: close the procedure opened by the matching proc
+	%ifnctx	proc
+		%error	Mismatched `endp'/`proc'
+	%else
+		%pop				; leave the proc context; its %$ret definition goes with it
+	%endif
+%endmacro
+
+%macro		ret	0			; operand-less ret: emit the return form selected by proc
+	%ifnctx	proc
+		RET				; outside proc/endp: plain return
+	%else
+		%$ret				; inside proc/endp: RET or RET ARGBYTES, as defined by proc
+	%endif
+%endmacro
+
+%macro invoke 1-*				; invoke FUNC [, ARG...]: push the ARGs last-to-first, then call FUNC
+	%rep	%0 - 1
+	%rotate -1				; bring the last not-yet-pushed argument into %1
+	push	%1
+	%endrep
+	%rotate -1				; one more rotation puts FUNC back into %1
+	call	%1
+%if (STDCALL == 0) && (%0 > 1)
+	add	esp, 4 * (%0 - 1)		; STDCALL == 0: the caller removes the pushed arguments
+%endif
+%endmacro
+
+;---------------------------------------
+; Windows
+;---------------------------------------
+%if XOS == XOS_WIN32
+
+%define TEXT  .text   align=32		; section name + attributes substituted into `segment TEXT'
+%define DATA  .data   align=32
+%define CONST .rdata  align=32
+
+%macro GET_GC 0
+	mov	gc, [_GlideRoot + curGC]	; gc = DWORD at _GlideRoot + curGC (gc, curGC are not defined in this header)
+%endmacro
+
+%endif
+
+;---------------------------------------
+; DJGPP
+;---------------------------------------
+%if XOS == XOS_DJGPP
+
+%define TEXT  .text			; section names substituted into `segment TEXT'; no align attribute here
+%define DATA  .data
+%define CONST .rodata
+
+%macro GET_GC 0
+	mov	gc, [_GlideRoot + curGC]	; gc = DWORD at _GlideRoot + curGC (gc, curGC are not defined in this header)
+%endmacro
+
+%endif
+
+;---------------------------------------
+; Linux
+;---------------------------------------
+%if XOS == XOS_LINUX
+
+%define TEXT  .text   align=32		; section name + attributes substituted into `segment TEXT'
+%define DATA  .data   align=32
+%define CONST .rodata align=32
+
+%macro GET_GC 0
+	mov	gc, [_GlideRoot + curGC]	; gc = DWORD at _GlideRoot + curGC (gc, curGC are not defined in this header)
+%endmacro
+
+%endif
diff --git a/glide3x/cvg/glide3/src/xtexdl.asm b/glide3x/cvg/glide3/src/xtexdl.asm
index 9520f2c..4ddd746 100644
--- a/glide3x/cvg/glide3/src/xtexdl.asm
+++ b/glide3x/cvg/glide3/src/xtexdl.asm
@@ -19,8 +19,45 @@
 ;; $Header$
 ;; $Revision$
 ;; $Log$
+;; Revision 1.1.8.7  2003/09/12 05:08:35  koolsmoky
+;; preparing for graphic context checks
+;;
+;; Revision 1.1.8.6  2003/07/07 23:29:06  koolsmoky
+;; cleaned logs
+;;
+;;
+;; Revision 1.1  2000/06/15 00:27:43  joseph
+;; Initial checkin into SourceForge.
 ;; 
-;; 1     10/08/98 11:30a Brent
+;; 10    8/17/99 6:35p Atai
+;; fixed amd debug mode
+;; 
+;; 9     4/08/99 1:22p Atai
+;; added contect check for _grTexDownload_3DNow_MMX
+;; 
+;; 8     3/19/99 11:26a Peter
+;; expose direct fifo for gl
+;; 
+;; 7     2/02/99 4:36p Peter
+;; download through lfb rather than texture port
+;; 
+;; 6     12/17/98 2:36p Atai
+;; check in Norbert's fix for texture download width correction
+;; 
+;; 5     12/07/98 11:33a Peter
+;; norbert's re-fixes of my merge
+;; 
+;; 4     11/02/98 5:34p Atai
+;; merge direct i/o code
+;; 
+;; 3     10/20/98 5:34p Atai
+;; added #ifdefs for hwc
+;; 
+;; 2     10/14/98 12:05p Peter
+;; fixed my effed up assumption about non-volatile regs
+;; 
+;; 1     10/09/98 6:48p Peter
+;; 3DNow!(tm) version of wide texture downloads
 ;; 
 ;; 3     10/07/98 9:43p Peter
 ;; triangle procs for 3DNow!(tm)
@@ -32,247 +69,646 @@
 ;; mmx stuff for 3DNow!(tm) capable processors
 ;; 
 
-TITLE   xtexdl.asm
-OPTION OLDSTRUCTS
+%include "xos.inc"
 
-.586P
-.MMX
-.K3D
-    
-EXTRN   __FifoMakeRoom: NEAR
+extrn _FifoMakeRoom
+
+%MACRO _grCommandTransportMakeRoom 3    ; call _FifoMakeRoom(%1, %2, %3); call sites pass (bytes needed, 0, __LINE__)
+    push %3                             ; arguments go on the stack last-to-first
+    push %2
+    push %1
+    call _FifoMakeRoom
+    add  esp, 12                        ; remove the three DWORD arguments again
+%ENDMACRO ; _grCommandTransportMakeRoom
 
 ;;; Definitions of cvg regs and glide root structures.
-INCLUDE fxgasm.h
+%INCLUDE "fxgasm.h"
 
-; Arguments (STKOFF = 12 from 3 dword pushes)
-STACKOFFSET = 12
-_gc$	    =  4 + STACKOFFSET
-_baseAddr$  =  8 + STACKOFFSET
-_maxS$	    = 12 + STACKOFFSET
-_minT$	    = 16 + STACKOFFSET
-_maxT$	    = 20 + STACKOFFSET
-_texData$   = 24 + STACKOFFSET
+; Arguments (STKOFF = 16 from 4 dword pushes); offsets are relative to esp after all four pushes
+STACKOFFSET equ 16                      ; bytes taken by the saved ebx, esi, edi, ebp
+_gc$        equ  4 + STACKOFFSET        ; first argument sits just above the return address
+_baseAddr$  equ  8 + STACKOFFSET
+_maxS$      equ 12 + STACKOFFSET        ; this slot is overwritten with the scan line width in bytes
+_minT$      equ 16 + STACKOFFSET
+_maxT$      equ 20 + STACKOFFSET
+_texData$   equ 24 + STACKOFFSET
 
-    ;; NB: The first set of registers (eax-edx) are volatile across
+    ;; NB: The first set of registers (eax, ecx, and edx) are volatile across
     ;; function calls. The remaining registers are supposedly non-volatile
     ;; so they only store things that are non-volatile across the call.
-fifo	TEXTEQU <eax>		; Current fifo ptr in inner loop
-texAddr TEXTEQU <ebx>		; Physical download address of the current scanline    
-gc	TEXTEQU	<ecx>		; Current graphics context
-curS	TEXTEQU	<edx>		; Current texture scanline
-    
-maxT	TEXTEQU	<esi>		; Max scanline line value (inclusive)
-dataPtr TEXTEQU <edi>		; Current user texture data ptr
-curT	TEXTEQU	<ebp>		; Current s coordinate in inner loop
 
-temp1	TEXTEQU	curS
-temp2	TEXTEQU texAddr
-temp3	TEXTEQU	gc    
-    
-GR_FIFO_WRITE   MACRO __addr, __offset, __data
-    mov    [__addr + __offset], __data
-ENDM ; GR_FIFO_WRITE
+%define fifo    ebp         ; fifo write pointer (cached copy of [gc + fifoPtr])
+%define gc      esi         ; graphics context
+%define dataPtr edi         ; pointer to texture data to be downloaded
+%define curT    ebx         ; number of texture scan lines still to write (counts down to 0)
+%define curS    ecx         ; bytes left in the current scan line (counts down by 8 in .loopS)
+%define fRoom   edx         ; room available in fifo (in bytes)
 
 ;--------------------------------------------------------------------------
-_TEXT       SEGMENT PAGE PUBLIC USE32 'CODE'
-            ASSUME DS: FLAT, SS: FLAT
 
-            ALIGN  32
+%IFNDEF GL_SSE2
 
-	    PUBLIC __grTexDownload_3DNow_MMX@24
-__grTexDownload_3DNow_MMX@24 PROC NEAR
+;--------------------------------------------------------------------------
+;
+; GL_AMD3D, GL_MMX
+;
+;--------------------------------------------------------------------------
 
-    ;; Function prologue type things
-    ;; NB:   We are not bothering to preserve the contents
-    ;;	     of eax, ebx, ecx, edx because they are volatile
-    ;;	     by convention.
+segment		TEXT
+
+              ALIGN  32
+
+%IFDEF GL_AMD3D
+proc _grTexDownload_3DNow_MMX, 24
+%ENDIF
+%IFDEF GL_MMX
+proc _grTexDownload_MMX, 24
+%ENDIF
+    ;; 6 DWORD args (gc, baseAddr, maxS, minT, maxT, texData); per row minT..maxT: 2-DWORD header + maxS DWORDs of data
+    push      ebx                       ; save caller's register variable
+    mov       curT, [esp + _maxT$ - 12] ; curT = maxT (-12: three of the four saves not pushed yet)
+
+    push      esi                       ; save caller's register variable
+    mov       eax, [esp + _minT$ - 8]   ; eax = minT (-8: two saves not pushed yet)
+
+    push      edi                       ; save caller's register variable
+    mov       gc, [esp + _gc$ - 4]      ; gc (-4: one save not pushed yet)
+
+    push      ebp                       ; save caller's register variable
+    mov       dataPtr, [esp + _texData$]; dataPtr
+
+%IFDEF GLIDE_ALT_TAB
+    test      gc, gc
+    je        .dlDone
+;    mov       edx, [gc + windowed]
+;    test      edx, 1
+;    jnz       .pastContextTest
+    mov       edx, DWORD [gc+lostContext]
+    mov       ecx, [edx]
+    test      ecx, 1
+    jnz       .dlDone
+;.pastContextTest:
+%ENDIF
+
+    sub       curT, eax                 ; curT = maxT - minT
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr
+
+    mov       curS, [esp + _maxS$]      ; curS = maxS 
+    add       curT, 1                   ; curT = maxT - minT + 1
+
+%IFDEF GL_AMD3D
+    femms                               ; we'll use MMX/3DNow!, make sure FPU register cleared
+%ENDIF
+%IFDEF GL_MMX
+    emms                                ; we'll use MMX
+%ENDIF
+
+    mov       edx, curS                 ; edx = maxS = scanline width in DWORDs
+    movd      mm3, [esp + _baseAddr$]   ; 0 | address of texture to download
+
+    shl       curS, 2                   ; scan line width (in bytes)
+    mov       eax, [esp + _minT$]       ; eax = minT
+
+    mov       [esp + _maxS$], curS      ; save scan line width (in bytes)
+    shl       edx, 3                    ; packetHdr<21:3> = maxS = scanline width in DWORDs
+
+    imul      eax, curS                 ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1)
+
+    movd      mm2, curS                 ; 0 | TEX_ROW_ADDR_INCR(1)
+    or        edx, 00000005h            ; packetHdr<31:30> = lfb port
+                                        ; packetHdr<21:3>  = maxS
+                                        ; packetHdr<2:0>   = packetType 5 
+
+    movd      mm1, edx                  ; 0 | packetHdr
+    movd      mm4, eax                  ; 0 | TEX_ROW_ADDR_INCR(minT)
+
+    psllq     mm2, 32                   ; TEX_ROW_ADDR_INCR(1) | 0
+    paddd     mm3, mm4                  ; 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
+
+    mov       fRoom, [gc + fifoRoom]    ; get available fifoRoom (in bytes)
+    punpckldq mm1, mm3                  ; hdr2 = texAddr | hdr1 = packetHdr
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS
+    ;; edx = fifoRoom, mm1 = texAddr|packetHdr, mm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    test      fifo, 4                   ; is fifo QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
+
+    cmp       fRoom, 4                  ; enough room for NULL packet in fifo?
+    jge       .mmxAlignFifo             ; yes, write NULL packet to align fifo
+
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room
+%endif
     
-    ;; Enter 3DNow!(tm) state for the duration of the function
-    ;; because we don't use or call anything that uses fp.
-    femms
-    
-    mov	    gc, [esp + _gc$ - STACKOFFSET + 0]
-    push    esi
-    
-    mov	    maxT, [esp + _maxT$ - STACKOFFSET + 4]
-    push    edi
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr modified by _grCommandTransportMakeRoom, reload
 
-    shl	    maxT, 9		; Convert maxT to rowAddr format
-    push    ebp
-        
-    mov	    dataPtr, [esp + _texData$]
-    mov	    curT, [esp + _minT$]    
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; reload maxS (destroyed by call to _grCommandTransportMakeRoom)
 
-    ;; Pad out fifo so that we can use mmx writes the whole way w/o
-    ;; any intermediate tests in the inner loop for fifo alignment.
-    ;; Conveniently, the packet header is 2 dwords which matches
-    ;; the size of the mmx write.
-    mov	    fifo, [gc + fifoPtr]; Cache fifo ptr
-    mov	    texAddr, [esp + _baseAddr$]; Texture physical address
+    test      fifo, 4                   ; new fifoPtr QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
 
-    mov	    temp1, [esp + _maxS$]; Pre-convert maxS into packet 5 field format
-    sub	    texAddr, [gc + tex_ptr]; Convert to hw base relative address            
+.mmxAlignFifo:
 
-    shl	    temp1, 2		; Write size dwords -> bytes
-    mov	    [esp + _baseAddr$], texAddr
+    mov       DWORD [fifo], 0           ; write NULL packet
+    sub       fRoom, 4                  ; fifoRoom -= 4
 
-    shl	    curT, 9		; curT = TEX_ROW_ADDR_INCR(curT)      
-    mov	    [esp + _maxS$], temp1; Write back converted s coordinate
+    mov       [gc + fifoRoom], fRoom    ; store new fifoRoom
+    add       fifo, 4                   ; fifoPtr += 4
 
-    shl	    temp1, 1		; Write size to packet 5 field format
-    test    fifo, 4h		; Aligned fifo ptr?
-    
-    mov	    temp2, [gc + fifoRoom]; temp2 = gc->fifoRoom
-    mov	    [esp + _maxT$], temp1; Write back converted field format size
-    
-    jz	    __loopT
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
 
-    ;; Check to make sure there's room in the fifo. If not then
-    ;; we'll wrap and then it should be aligned for the remainder of
-    ;; this function invocation.
-    cmp	    temp2, 4h
-    jg	    __mmxAlignFifo
+    mov       [gc + fifoPtr], fifo      ; store new fifoPtr
+    jmp       .startDownload            ; fifo aligned, download texture now
 
-    push    @Line		; Line # inside this function
-    push    0h			; NULL file name
+    align 32
 
-    push    4h			; fifo space required
-    call    __FifoMakeRoom	; Get fifo room
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
+    ;; edx=fifoRoom, mm1 = texAddr|packetHdr, mm2 = TEX_ROW_ADDR_INCR(1)|0
 
-    ;; Calling out to external code means that our registers can get
-    ;; trashed in the same way that we trash things. Plus we need to
-    ;; re-cache the fifoPtr since we may have wrapped.
+.loopT:
 
-    add	    esp, 12		; Pop the 3 DWORDs for the fifoWrap parameters
-    mov	    gc, [esp + _gc$]
+%IFDEF GLIDE_DEBUG
 
-    ;; Setup the regs to do the alignment
-    mov	    fifo, [gc + fifoPtr]
-    test    fifo, 4h
-    
-    mov	    temp2, [gc + fifoRoom]
-    jz	    __loopT
+    ;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned
 
-__mmxAlignFifo:	    
-    add	    fifo, 4h		; packetPtr++
-    xor	    temp1, temp1	; Clear the nop packet
-    
-    mov	    [gc + fifoPtr], fifo; gc->fifoPtr = packetPtr
-    sub	    temp2, 4h		; fifoRoom -= 4
+    test      fifo, 4                   ; is fifoPtr QWORD aligned ?
+    jz        .alignmentOK              ; yup, continue
 
-    mov	    [gc + fifoRoom], temp2; gc->fifoRoom = fifoRoom
-    GR_FIFO_WRITE fifo, -4, temp1; NOP packet(0)
+    xor       eax, eax                  ; create 0
+    mov       [eax], eax                ; move to DS:[0] forces GP 
+.alignmentOK:      
+%ENDIF ; GLIDE_DEBUG
 
-	    align 4
-__loopT:
-    ;; Check for room to write the current texture scanline    
-    mov	    temp1, [esp + _maxS$]; temp1 = width of scanline (bytes)
-    mov	    temp2, [gc + fifoRoom]; temp2 = gc->fifoRoom (bytes)
-
-    add	    temp1, 0Ch		; scanline width + sizeof(packet hdr) (bytes) + nop packet to mmx align
-    cmp	    temp2, temp1	; fifo space required >= space availible ?
-    
-    jge	    __dlBegin		; Yes, start download now w/ no more checking
-
-    push    @Line		; Line # inside this function
-    push    0h			; NULL file name
-
-    push    temp1		; fifo space required
-    call    __FifoMakeRoom	; Get fifo room
-
-    add	    esp, 12		; Pop the 3 DWORDs for the fifoWrap parameters
-    nop
-    
-    ;; Calling out to external code means that our registers can get
-    ;; trashed in the same way that we trash things. Plus we need to
-    ;; re-cache the fifoPtr since we may have wrapped.
-    mov	    gc, [esp + _gc$]        
-    mov	    fifo, [gc + fifoPtr]
-
-IFDEF GLIDE_DEBUG
-    ;; Make sure that we have an mmx happy aligned fifoPtr
-    test    fifo, 4
-    jz	    @1
-
-    ;; Fault right away because this would be a huge suck
-    xor	    eax, eax
-    mov	    [eax], eax
-@1:      
-ENDIF ; GLIDE_DEBUG    
-
-    	    align 4
-__dlBegin:
-
-IFDEF GLIDE_DEBUG
-    ;; Make sure that we have an mmx happy aligned fifoPtr
-    test    fifo, 4
-    jz	    @2
-
-    ;; Fault right away because this would be a huge suck
-    xor	    eax, eax
-    mov	    [eax], eax
-@2:      
-ENDIF ; GLIDE_DEBUG
-    
     ;; Compute packet header words
-    ;;	hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
+    ;;  hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
     ;;  hdr2: download address[29:0]
-    mov	    texAddr, [esp + _baseAddr$]; Download base address
-    mov	    temp1, [esp + _maxT$]; Pre-Converted # of words per packet/scanline
-    
-    mov	    temp3, 0C0000005h	; Base packet header (texture port | packet type 5)
-    add	    fifo, 8		; Pre-increment fifo ptr (hdr1)
-    
-    or	    temp3, temp1	; Base packet hdr | # of words
-    add	    texAddr, curT	; texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(curT)
-    
-    GR_FIFO_WRITE fifo, -8, temp3; Write hdr1
-    add	    curT, 200h		; curT += TEX_ROW_ADDR_INCR(1)
-    
-    GR_FIFO_WRITE fifo, -4, texAddr; write hdr2
-    mov	    curS, [esp + _maxS$]; curS = maxS
 
-	    align 4
+    movq      [fifo], mm1               ; store hdr2 | hdr1
+    add       fifo, 8                   ; increment fifo ptr (hdr1 + hdr2)
+
     ;; S coordinate inner loop unrolled for 8 texels a write
-__loopS:        
-    movq    mm0, [dataPtr]	; load (mmx) 64 bit data (8 texels)
-    add	    fifo, 8h		; pre-increment fifoPtr += 2 * sizeof(FxU32)
 
-    add	    dataPtr, 8h		; dataPtr += 2 * sizeof(FxU32)
-    sub	    curS, 8h		; curS -= 2 * sizeof(FxU32)
+.loopS:        
 
-    movq    [fifo - 8], mm0	; *fifoPtr = texelData[64 bits]
-    jnz	    __loopS		; if curS > 0
+    movq      mm0, [dataPtr]            ; load 64 bit data (8 texels)
+    add       fifo, 8                   ; pre-increment fifoPtr += 2 * sizeof(FxU32)
 
-    mov	    gc, [esp + _gc$]	; Re-cache gc which was trashed in the dl loop
-    mov	    temp1, fifo
+    add       dataPtr, 8                ; dataPtr += 2 * sizeof(FxU32)
+    sub       curS, 8                   ; curS -= 2 * sizeof(FxU32)
 
-    ;; Update gc->fifoPtr and gc->fifoRoom for the wrap/stall check
-    mov	    temp2, [gc + fifoPtr]
-    sub	    temp1, temp2	; # of bytes written to the fifo
+    movq      [fifo - 8], mm0           ; *fifoPtr = texelData[64 bits]
+    jnz       .loopS                    ; loop until curS == 0 (ZF from the sub above); needs byte width a positive multiple of 8
 
-    mov	    [gc + fifoPtr], fifo; gc->fifoPtr = packetPtr
-    mov	    temp2, [gc + fifoRoom]
+    mov       ecx, [gc + fifoPtr]       ; old fifo ptr
+    nop                                 ; filler
+
+    mov       eax, fifo                 ; new fifo ptr
+    mov       [gc + fifoPtr], fifo      ; save new fifo ptr
+
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
+
+    sub       eax, ecx                  ; new fifo ptr - old fifo ptr = fifo space used up
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+
+    sub       fRoom, eax                ; fifo space available -= fifo space used up
+    sub       curT, 1                   ; curT--
+
+    mov       [gc + fifoRoom], fRoom    ; save new fifo space available 
+    jz        .dlDone                   ; curT reached 0 (ZF from the sub above): all scan lines written
+
+    ;; Check for room to write the next texture scanline
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo
+    ;; edx = fifoRoom, mm1 = texAddr|packetHdr, mm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    paddd     mm1, mm2                  ; texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr
+    mov       esp, esp                  ; filler
+.startDownload:
+    lea       eax, [curS+8]             ; fifo space needed = scan line width + header size
+ 
+    cmp       fRoom, eax                ; fifo space available >= fifo space required ?
+    jge       .loopT                    ; yup, write next scan line
+
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room for header + scan line (original note was cut off: "if fifoPtr QWORD aligned before")
+%endif
     
-    sub	    temp2, temp1	; # of bytes left in fifo
-    cmp	    curT, maxT		; if (curT <= maxT) ?    
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr was modified by _grCommandTransportMakeRoom, reload
 
-    mov	    [gc + fifoRoom], temp2
-    jle	    __loopT
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom was modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+    jmp       .loopT                    ; we now have enough fifo room, write next scanline
 
-__dlDone:	    
-	    align 4
+.dlDone:           
+%IFDEF GL_AMD3D
+    femms                               ; exit 3DNow!(tm) state
+%ENDIF
+%IFDEF GL_MMX
+    emms                                ; exit MMX state
+%ENDIF
 
-    femms			; Exit 3DNow!(tm) state    
-    pop	    ebp
+    pop       ebp                       ; restore caller's register variable
+    pop       edi                       ; restore caller's register variable
     
-    pop	    edi
-    pop	    esi
+    pop       esi                       ; restore caller's register variable
+    pop       ebx                       ; restore caller's register variable
     
-    ret 18h			; Pop 6 parameters and return
+    ret                                 ; ret macro: RET 24 (pops the 6 DWORD parameters) when STDCALL, plain RET otherwise
+endp
 
-__grTexDownload_3DNow_MMX@24 ENDP
+%ELSE ; !GL_SSE2
 
-_TEXT ENDS
+;--------------------------------------------------------------------------
+;
+; GL_SSE2
+;
+;--------------------------------------------------------------------------
 
-END
+segment		TEXT
+
+              ALIGN  32
+
+proc _grTexDownload_SSE2_64, 24         ; 6 DWORD args: gc, baseAddr, maxS, minT, maxT, texData
+    ;; For each row minT..maxT: write a 2-DWORD packet header, then maxS DWORDs of texel data, to the fifo
+    push      ebx                       ; save caller's register variable
+    mov       curT, [esp + _maxT$ - 12] ; curT = maxT (-12: three of the four saves not pushed yet)
+
+    push      esi                       ; save caller's register variable
+    mov       eax, [esp + _minT$ - 8]   ; eax = minT (-8: two saves not pushed yet)
+
+    push      edi                       ; save caller's register variable
+    mov       gc, [esp + _gc$ - 4]      ; gc (-4: one save not pushed yet)
+
+    push      ebp                       ; save caller's register variable
+    mov       dataPtr, [esp + _texData$]; dataPtr
+
+%IFDEF GLIDE_ALT_TAB
+    test      gc, gc
+    je        .dlDone
+;    mov       edx, [gc + windowed]
+;    test      edx, 1
+;    jnz       .pastContextTest
+    mov       edx, DWORD [gc+lostContext]
+    mov       ecx, [edx]
+    test      ecx, 1
+    jnz       .dlDone
+;.pastContextTest:
+%ENDIF
+
+    sub       curT, eax                 ; curT = maxT - minT
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr
+
+    mov       curS, [esp + _maxS$]      ; curS = maxS 
+    add       curT, 1                   ; curT = maxT - minT + 1
+
+    mov       edx, curS                 ; edx = maxS = scanline width in DWORDs
+    movd      xmm3,[esp + _baseAddr$]   ; 0 | 0 | 0 | address of texture to download
+
+    shl       curS, 2                   ; scan line width (in bytes)
+    mov       eax, [esp + _minT$]       ; eax = minT
+
+    mov       [esp + _maxS$], curS      ; save scan line width (in bytes)
+    shl       edx, 3                    ; packetHdr<21:3> = maxS = scanline width in DWORDs
+
+    imul      eax, curS                 ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1)
+
+    movd      xmm2,curS                 ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(1)
+    or        edx, 00000005h            ; packetHdr<31:30> = lfb port
+                                        ; packetHdr<21:3>  = maxS
+                                        ; packetHdr<2:0>   = packetType 5 
+
+    movd      xmm1,edx                  ; 0 | 0 | 0 | packetHdr
+    movd      xmm4,eax                  ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(minT)
+
+    psllq     xmm2,32                   ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0
+    paddd     xmm3,xmm4                 ; 0 | 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
+
+    mov       fRoom, [gc + fifoRoom]    ; get available fifoRoom (in bytes)
+    punpckldq xmm1,xmm3                 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS
+    ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    test      fifo, 4                   ; is fifo QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
+
+    cmp       fRoom, 4                  ; enough room for NULL packet in fifo?
+    jge       .xmmAlignFifo             ; yes, write NULL packet to align fifo
+
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room
+%endif
+    
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr modified by _grCommandTransportMakeRoom, reload
+
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; reload maxS (destroyed by call to _grCommandTransportMakeRoom)
+
+    test      fifo, 4                   ; new fifoPtr QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
+
+.xmmAlignFifo:
+
+    mov       DWORD [fifo], 0           ; write NULL packet
+    sub       fRoom, 4                  ; fifoRoom -= 4
+
+    mov       [gc + fifoRoom], fRoom    ; store new fifoRoom
+    add       fifo, 4                   ; fifoPtr += 4
+
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
+
+    mov       [gc + fifoPtr], fifo      ; store new fifoPtr
+    jmp       .startDownload            ; fifo aligned, download texture now
+
+    align 32
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
+    ;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+.loopT:
+
+%IFDEF GLIDE_DEBUG
+
+    ;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned
+
+    test      fifo, 4                   ; is fifoPtr QWORD aligned ?
+    jz        .alignmentOK              ; yup, continue
+
+    xor       eax, eax                  ; create 0
+    mov       [eax], eax                ; move to DS:[0] forces GP 
+.alignmentOK:      
+%ENDIF ; GLIDE_DEBUG
+
+    ;; Compute packet header words
+    ;;  hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
+    ;;  hdr2: download address[29:0]
+
+    movq      [fifo],xmm1               ; store hdr2 | hdr1
+    add       fifo, 8                   ; increment fifo ptr (hdr1 + hdr2)
+
+    ;; S coordinate inner loop: each pass copies 8 bytes (one QWORD) of texel data
+
+.loopS:        
+
+    movq      xmm0,[dataPtr]            ; load 64 bit data (8 texels)
+    add       fifo, 8                   ; pre-increment fifoPtr += 2 * sizeof(FxU32)
+
+    add       dataPtr, 8                ; dataPtr += 2 * sizeof(FxU32)
+    sub       curS, 8                   ; curS -= 2 * sizeof(FxU32)
+
+    movq      [fifo - 8],xmm0           ; *fifoPtr = texelData[64 bits]
+    jnz       .loopS                    ; loop until curS == 0 (ZF from the sub above); needs byte width a positive multiple of 8
+
+    mov       ecx, [gc + fifoPtr]       ; old fifo ptr
+    nop                                 ; filler
+
+    mov       eax, fifo                 ; new fifo ptr
+    mov       [gc + fifoPtr], fifo      ; save new fifo ptr
+
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
+
+    sub       eax, ecx                  ; new fifo ptr - old fifo ptr = fifo space used up
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+
+    sub       fRoom, eax                ; fifo space available -= fifo space used up
+    sub       curT, 1                   ; curT--
+
+    mov       [gc + fifoRoom], fRoom    ; save new fifo space available 
+    jz        .dlDone                   ; curT reached 0 (ZF from the sub above): all scan lines written
+
+    ;; Check for room to write the next texture scanline
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo
+    ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    paddd     xmm1,xmm2                 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr
+    mov       esp, esp                  ; filler
+.startDownload:
+    lea       eax, [curS+8]             ; fifo space needed = scan line width + header size
+ 
+    cmp       fRoom, eax                ; fifo space available >= fifo space required ?
+    jge       .loopT                    ; yup, write next scan line
+
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room for header + scan line (original note was cut off: "if fifoPtr QWORD aligned before")
+%endif
+    
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr was modified by _grCommandTransportMakeRoom, reload
+
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom was modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+    jmp       .loopT                    ; we now have enough fifo room, write next scanline
+
+.dlDone:
+    pop       ebp                       ; restore caller's register variable
+    pop       edi                       ; restore caller's register variable
+    
+    pop       esi                       ; restore caller's register variable
+    pop       ebx                       ; restore caller's register variable
+    
+    ret                                 ; ret macro: RET 24 (pops the 6 DWORD parameters) when STDCALL, plain RET otherwise
+endp
+
+
+
+segment		TEXT
+
+              ALIGN  32
+
+;;--------------------------------------------------------------------
+;; _grTexDownload_SSE2_128
+;;
+;; Writes texture rows minT..maxT into the command fifo. Each row is
+;; sent as one packet: an 8-byte header (packetHdr | texAddr) followed
+;; by the row data, copied 16 bytes at a time with movdqu.
+;;
+;; Stack arguments referenced: _gc$, _baseAddr$, _maxS$, _minT$,
+;; _maxT$, _texData$ (the "24" on the proc line is presumably their
+;; total size in bytes -- confirm against the proc macro).
+;; ebx, esi, edi and ebp are saved on entry and restored at .dlDone.
+;;
+;; NOTE(review): the inner loop counts curS down by 16 and leaves only
+;; on zero, so the row width in bytes (maxS * 4) must be a multiple of
+;; 16 -- presumably guaranteed by the caller; confirm.
+;;--------------------------------------------------------------------
+proc _grTexDownload_SSE2_128, 24
+
+    ;; The -12 / -8 / -4 adjustments below compensate for the pushes that
+    ;; have not happened yet; the unadjusted _xxx$ displacements are used
+    ;; once all four registers are on the stack.
+
+    push      ebx                       ; save caller's register variable
+    mov       curT, [esp + _maxT$ - 12] ; curT = maxT
+
+    push      esi                       ; save caller's register variable
+    mov       eax, [esp + _minT$ - 8]   ; minT
+
+    push      edi                       ; save caller's register variable
+    mov       gc, [esp + _gc$ - 4]      ; gc
+
+    push      ebp                       ; save caller's register variable
+    mov       dataPtr, [esp + _texData$]; dataPtr
+
+%IFDEF GLIDE_ALT_TAB
+    ;; Skip the download when gc is NULL or bit 0 of *gc->lostContext is set
+    test      gc, gc
+    je        .dlDone
+;    mov       edx, [gc + windowed]
+;    test      edx, 1
+;    jnz       .pastContextTest
+    mov       edx, DWORD [gc+lostContext]
+    mov       ecx, [edx]
+    test      ecx, 1
+    jnz       .dlDone
+;.pastContextTest:
+%ENDIF
+
+    sub       curT, eax                 ; curT = maxT - minT
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr
+
+    mov       curS, [esp + _maxS$]      ; curS = maxS 
+    add       curT, 1                   ; curT = maxT - minT + 1
+
+    mov       edx, curS                 ; curS = maxS = scanline width in DWORDs
+    movd      xmm3,[esp + _baseAddr$]   ; 0 | 0 | 0 | address of texture to download
+
+    shl       curS, 2                   ; scan line width (in bytes)
+    mov       eax, [esp + _minT$]       ; 0 | minT
+
+    ;; From here on the _maxS$ stack slot holds the width in bytes, not DWORDs
+    mov       [esp + _maxS$], curS      ; save scan line width (in bytes)
+    shl       edx, 3                    ; packetHdr<21:3> = maxS = scanline width in DWORDs
+
+    imul      eax, curS                 ; TEX_ROW_ADDR_INCR(minT) = minT * TEX_ROW_ADDR_INCR(1)
+
+    movd      xmm2,curS                 ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(1)
+    or        edx, 00000005h            ; packetHdr<31:30> = lfb port
+                                        ; packetHdr<21:3>  = maxS
+                                        ; packetHdr<2:0>   = packetType 5 
+
+    movd      xmm1,edx                  ; 0 | 0 | 0 | packetHdr
+    movd      xmm4,eax                  ; 0 | 0 | 0 | TEX_ROW_ADDR_INCR(minT)
+
+    psllq     xmm2,32                   ; 0 | 0 | TEX_ROW_ADDR_INCR(1) | 0
+    paddd     xmm3,xmm4                 ; 0 | 0 | 0 | texAddr = texBaseAddr + TEX_ROW_ADDR_INCR(minT)
+
+    mov       fRoom, [gc + fifoRoom]    ; get available fifoRoom (in bytes)
+    punpckldq xmm1,xmm3                 ; 0 | 0 | hdr2 = texAddr | hdr1 = packetHdr
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = curS = maxS
+    ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    test      fifo, 4                   ; is fifo QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
+
+    cmp       fRoom, 4                  ; enough room for NULL packet in fifo?
+    jge       .xmmAlignFifo             ; yes, write NULL packet to align fifo
+
+    ;; When USE_PACKET_FIFO is not defined nothing is emitted here and the
+    ;; code simply reloads the fifo state below.
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom 4, 0, __LINE__; make fifo room
+%endif
+    
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr modified by _grCommandTransportMakeRoom, reload
+
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; reload maxS (destroyed by call to _grCommandTransportMakeRoom)
+
+    test      fifo, 4                   ; new fifoPtr QWORD aligned ?
+    jz        .startDownload            ; yup, start texture download
+
+.xmmAlignFifo:
+
+    mov       DWORD [fifo], 0           ; write NULL packet
+    sub       fRoom, 4                  ; fifoRoom -= 4
+
+    mov       [gc + fifoRoom], fRoom    ; store new fifoRoom
+    add       fifo, 4                   ; fifoPtr += 4
+
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
+
+    mov       [gc + fifoPtr], fifo      ; store new fifoPtr
+    jmp       .startDownload            ; fifo aligned, download texture now
+
+    align 32
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo, ecx = maxS = curS
+    ;; edx=fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    ;; Outer loop: one iteration per texture row
+.loopT:
+
+%IFDEF GLIDE_DEBUG
+
+    ;; Make sure that we have a QWORD aligned fifoPtr; force GP if not aligned
+
+    test      fifo, 4                   ; is fifoPtr QWORD aligned ?
+    jz        .alignmentOK              ; yup, continue
+
+    xor       eax, eax                  ; create 0
+    mov       [eax], eax                ; move to DS:[0] forces GP 
+.alignmentOK:      
+%ENDIF ; GLIDE_DEBUG
+
+    ;; Compute packet header words
+    ;;  hdr1: downloadSpace[31:30] numWords[21:3] packetType[2:0]
+    ;;  hdr2: download address[29:0]
+
+    movq      [fifo],xmm1               ; store hdr2 | hdr1
+    add       fifo, 8                   ; increment fifo ptr (hdr1 + hdr2)
+
+    ;; S coordinate inner loop unrolled for 8 texels a write
+    ;; Each pass copies 16 bytes from dataPtr to the fifo. The jnz at the
+    ;; bottom tests the flags left by "sub curS, 16"; the movdqu between
+    ;; them does not modify flags.
+
+.loopS:        
+
+    movdqu    xmm0, [dataPtr]           ; load 128 bit data (8 texels) ; isn't 16 bytes aligned?
+    add       fifo, 16                  ; pre-increment fifoPtr += 4 * sizeof(FxU32)
+
+    add       dataPtr, 16               ; dataPtr += 4 * sizeof(FxU32)
+    sub       curS, 16                  ; curS -= 4 * sizeof(FxU32)
+
+    movdqu    [fifo - 16], xmm0         ; *fifoPtr = texelData[128 bits] ; isn't 16 bytes aligned?
+    jnz       .loopS                    ; loop while curS > 0
+
+    ;; Row finished: publish the new fifoPtr and charge the bytes written
+    ;; (header + data) against fifoRoom.
+
+    mov       ecx, [gc + fifoPtr]       ; old fifo ptr
+    nop                                 ; filler
+
+    mov       eax, fifo                 ; new fifo ptr
+    mov       [gc + fifoPtr], fifo      ; save new fifo ptr
+
+%IFDEF GLIDE_DEBUG
+    mov       [gc + checkPtr], fifo      ; checkPtr
+%ENDIF
+
+    sub       eax, ecx                  ; new fifo ptr - old fifo ptr = fifo space used up
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+
+    sub       fRoom, eax                ; new fifo space available = old fifo space available - fifo space used up = new fifo space available
+    sub       curT, 1                   ; curT--
+
+    ;; The jz below tests the flags from "sub curT, 1"; the mov in between
+    ;; leaves them untouched.
+    mov       [gc + fifoRoom], fRoom    ; save new fifo space available 
+    jz        .dlDone                   ; loop while curT > 0
+
+    ;; Check for room to write the next texture scanline
+
+    ;; ebx = curT, edi = dataPtr, esi = gc, ebp = fifo
+    ;; edx = fifoRoom, xmm1 = texAddr|packetHdr, xmm2 = TEX_ROW_ADDR_INCR(1)|0
+
+    paddd     xmm1,xmm2                 ; 0 | 0 | texAddr+=TEX_ROW_ADDR_INCR(1) | packetHdr
+    mov       esp, esp                  ; filler
+.startDownload:
+    lea       eax, [curS+8]             ; fifo space needed = scan line width + header size
+ 
+    cmp       fRoom, eax                ; fifo space available >= fifo space required ?
+    jge       .loopT                    ; yup, write next scan line
+
+    ;; Not enough room: request eax bytes (row + header), then reload the
+    ;; fifo state, which the request may have changed.
+%ifdef USE_PACKET_FIFO
+    _grCommandTransportMakeRoom eax, 0, __LINE__; make fifo room (if fifoPtr QWORD aligned before
+%endif
+    
+    mov       fifo, [gc + fifoPtr]      ; fifoPtr was modified by _grCommandTransportMakeRoom, reload
+
+    mov       fRoom, [gc + fifoRoom]    ; fifoRoom was modified by _grCommandTransportMakeRoom, reload
+    mov       curS, [esp + _maxS$]      ; curS = maxS = width of scanline (bytes)
+    jmp       .loopT                    ; we now have enough fifo room, write next scanline
+
+.dlDone:
+    pop       ebp                       ; restore caller's register variable
+    pop       edi                       ; restore caller's register variable
+    
+    pop       esi                       ; restore caller's register variable
+    pop       ebx                       ; restore caller's register variable
+    
+    ret                                 ; pop 6 DWORD parameters and return
+endp
+
+
+%ENDIF ; GL_SSE2
diff --git a/glide3x/cvg/glide3/tests/Makefile.DJ b/glide3x/cvg/glide3/tests/Makefile.DJ
new file mode 100644
index 0000000..37c680b
--- /dev/null
+++ b/glide3x/cvg/glide3/tests/Makefile.DJ
@@ -0,0 +1,54 @@
+# DOS/DJGPP tests makefile for Glide3
+#
+#  Copyright (c) 2002 - Borca Daniel
+#  Email : dborca@users.sourceforge.net
+#  Web   : http://www.geocities.com/dborca
+#
+# $Header$
+#
+
+
+#
+#  Available options:
+#
+#    Environment variables:
+#	CPU		optimize for the given processor.
+#			default = pentium
+#	DXE=1		use DXE modules.
+#			default = no
+#
+#    Targets:
+#	<file.exe>	build a specific file
+#
+
+
+.PHONY: all
+.SUFFIXES: .c .o .exe
+# Keep tlib.o after it has been built as an intermediate of a test link
+.SECONDARY: tlib.o
+
+# Hardware subtree name and path to the tree root; both are used to form
+# the include and library search paths below
+FX_GLIDE_HW = cvg
+TOP = ../../..
+CPU ?= pentium
+
+CC = gcc
+CFLAGS = -Wall -O2 -ffast-math -mcpu=$(CPU)
+CFLAGS += -I$(TOP)/$(FX_GLIDE_HW)/glide3/src -I$(TOP)/$(FX_GLIDE_HW)/incsrc
+CFLAGS += -I$(TOP)/swlibs/fxmisc
+CFLAGS += -D__DOS__ -DCVG
+CFLAGS += -D__DOS32__
+
+LDFLAGS = -s -L$(TOP)/$(FX_GLIDE_HW)/lib
+
+# Link against gld3i when DXE is set, gld3x otherwise
+ifdef DXE
+LDLIBS = -lgld3i
+else
+LDLIBS = -lgld3x
+endif
+
+# Compile one C source into one object
+.c.o:
+	$(CC) -o $@ $(CFLAGS) -c $<
+# Link each test program from its own object plus the shared tlib.o
+%.exe: tlib.o %.o
+	$(CC) -o $@ $(LDFLAGS) $^ $(LDLIBS)
+
+# There is no default build: a specific <file.exe> must be requested
+all:
+	$(error Must specify <filename.exe> to build)
diff --git a/glide3x/cvg/glide3/tests/makefile.linux b/glide3x/cvg/glide3/tests/makefile.linux
index 0ad8911..98b6c7b 100644
--- a/glide3x/cvg/glide3/tests/makefile.linux
+++ b/glide3x/cvg/glide3/tests/makefile.linux
@@ -1,78 +1,48 @@
+# Linux tests makefile for Glide3
 #
-# Insert new header here
+#  Copyright (c) 2002 - Borca Daniel
+#  Email : dborca@users.sourceforge.net
+#  Web   : http://www.geocities.com/dborca
+#
+# $Header$
 #
 
 
-LDIRT= $(wildcard *.exe *.map *.sys *.o *.a)
-
-# Special case rush because its built off of the sst1 tree
-ifeq ($(FX_GLIDE_HW),SST96)
-GLIDE_ROOT      =       $(BUILD_ROOT)/sst1
-else
-GLIDE_ROOT      =       $(BUILD_ROOT)/$(FX_GLIDE_HW)
-endif
+#
+#  Available options:
+#
+#    Environment variables:
+#	CPU		optimize for the given processor.
+#			default = pentium
+#
+#    Targets:
+#	<file.exe>	build a specific file
+#
 
 
-LCINCS  += -I$(BUILD_ROOT)/$(FX_GLIDE_HW)/include
+.PHONY: all
+.SUFFIXES: .c .o .exe
+# Keep tlib.o after it has been built as an intermediate of a test link
+.SECONDARY: tlib.o
 
-LIBOBJS = tlib.o
+# Hardware subtree name and path to the tree root; both are used to form
+# the include and library search paths below
+FX_GLIDE_HW = cvg
+TOP = ../../..
+CPU ?= pentium
 
-GLIDELIB        =       -L$(GLIDE_ROOT)/lib -lglide3
+CC = gcc
+CFLAGS = -Wall -O2 -ffast-math -mcpu=$(CPU)
+CFLAGS += -I$(TOP)/$(FX_GLIDE_HW)/glide3/src -I$(TOP)/$(FX_GLIDE_HW)/incsrc
+CFLAGS += -I$(TOP)/swlibs/fxmisc
+CFLAGS += -DCVG
 
-LLDLIBS = $(LIBOBJS) $(GLIDELIB)
+LDFLAGS = -s -L$(TOP)/$(FX_GLIDE_HW)/lib
 
-ifeq ($(HAL_CSIM),1)
-LLDLIBS += $(BUILD_ROOT)/$(FX_GLIDE_HW)/lib/lib$(FX_GLIDE_HW)hal.a
-endif
+# Tests link against the glide3 library and the math library
+LDLIBS = -lglide3
+LDLIBS += -lm
 
-PRIVATE_HEADERS = tlib.h tlib.c tldata.inc
+# Compile one C source into one object
+.c.o:
+	$(CC) -o $@ $(CFLAGS) -c $<
+# Link each test program from its own object plus the shared tlib.o
+%.exe: tlib.o %.o
+	$(CC) -o $@ $(LDFLAGS) $^ $(LDLIBS)
 
-CFILES   = test00.c \
-           test01.c \
-           test02.c \
-           test03.c \
-           test04.c \
-           test05.c \
-           test06.c \
-           test07.c \
-           test08.c \
-           test09.c \
-           test10.c \
-           test11.c \
-           test12.c \
-           test13.c \
-           test14.c \
-           test15.c \
-           test16.c \
-           test17.c \
-           test18.c \
-           test19.c \
-           test20.c \
-           test21.c \
-           test22.c \
-           test23.c \
-           test24.c \
-           test25.c \
-           test26.c \
-           test27.c \
-           test28.c \
-           test29.c \
-           test30.c \
-           test31.c \
-           test32.c \
-           test33.c \
-           test34.c \
-           test35.c \
-           test36.c \
-           display.c \
-           sbench.c
-
-PROGRAMS = $(CFILES:.c=)
-
-DATAFILES = alpha.3df decal1.3df lava.3df light.3df matt1.3df miro.3df
-
-include $(BUILD_ROOT)/swlibs/include/make/3dfx.linux.mak
-
-$(PROGRAMS):  $(LIBOBJS)
-
-        
+# There is no default build: a specific <file.exe> must be requested
+all:
+	$(error Must specify <filename.exe> to build)
diff --git a/glide3x/cvg/glide3/tests/tlib.c b/glide3x/cvg/glide3/tests/tlib.c
index ceeb56f..46514c2 100644
--- a/glide3x/cvg/glide3/tests/tlib.c
+++ b/glide3x/cvg/glide3/tests/tlib.c
@@ -1636,6 +1636,12 @@ tlErrorMessage( char *err) {
   fprintf(stderr, err);
 } /* tlErrorMessage */
 
+/*
+ * Window-handle query for the platform branch that precedes the
+ * __WIN32__ one: always yields -1 converted to FxU32 (all bits set).
+ */
+FxU32
+tlGethWnd(void)
+{
+  return (FxU32)-1;
+}
+
 #elif __WIN32__
 
 
diff --git a/glide3x/cvg/init/fxremap.c b/glide3x/cvg/init/fxremap.c
new file mode 100644
index 0000000..1204da4
--- /dev/null
+++ b/glide3x/cvg/init/fxremap.c
@@ -0,0 +1,884 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef __linux__
+#include <conio.h>
+#endif
+#include <3dfx.h>
+#include <fxpci.h>
+
+/* Null pointer constant used for the list links in this file. */
+#define null               0
+#define SIZE_SST1_NEEDED   0x100000
+/* Upper limit used when checking and placing ranges; same (address >> 4)
+ * units as RangeSTRUCT.address. */
+#define END_ADDRESS        0x10000000
+/* AdjustMapForS3 lowers an S3 entry's start by this much and sets its
+ * range to twice this value. */
+#define S3_SHIFT           0x400000
+
+/*
+ * One PCI memory range. Entries are linked into a list kept in ascending
+ * address order (see AddMapEntry / InsertEntry).
+ */
+struct RangeSTRUCT
+{
+   FxU32                address;   /* start of range, stored as (bus address >> 4) */
+   FxU32                range;     /* length, stored as ((~sizing value) >> 4) + 1 */
+   FxU32                id;        /* PCI device number, function number in bits 13 and up */
+   FxU32                is_voodoo; /* 1 = is voodoo, 2 = hidden SLI */
+   FxU32                is_S3;     /* nonzero when IsCardS3 recognised the device */
+   struct RangeSTRUCT   *next;     /* next higher range, null at the tail */
+   struct RangeSTRUCT   *prev;     /* next lower range, null at the head */
+};
+
+/* Nonzero suppresses every printf in this file; set by fxremap_dowork. */
+int silent = 1;
+
+typedef struct RangeSTRUCT RangeStruct;
+
+void InitRemap(void);
+void CloseRemap(void);
+void GetMemoryMap(void);
+void RemapVoodoo(RangeStruct *conflict);
+void AdjustMapForS3(void);
+RangeStruct *TestForConflicts(void);
+void RemoveEntry(RangeStruct *del);
+void InsertEntry(RangeStruct *ins);
+FxBool FindHole(RangeStruct *conflict);
+FxU32 SnapToDecentAddress(FxU32 address,RangeStruct *conflict);
+FxBool fits_in_hole(RangeStruct *begin,FxU32 end,RangeStruct *hole,RangeStruct *conflict);
+FxBool fits_under(RangeStruct *first,FxU32 minimum,RangeStruct *hole,RangeStruct *conflict);
+FxU32 pciGetType(long i);
+void pciGetRange(PciRegister reg,FxU32 device_number,FxU32 *data);
+FxBool pciGetAddress(PciRegister reg,FxU32 device_number,FxU32 *data);
+
+void ForceCleanUp(void);
+FxBool FindNecessaryCards(void);
+void ProcessCommandLine(char **argv,int argc);
+FxBool IsCardVoodoo(long i);
+FxBool IsCardS3(long i);
+FxBool ReadHex(char *string,FxU32 *num);
+void AddMapEntry(FxU32 address,FxU32 range,FxU32 id,FxBool VoodooCard,FxBool S3Card);
+void HandleMemoryOverlap(void);
+FxBool overlap_map(RangeStruct *begin,FxU32 end);
+
+/* Command-line switches, written by ProcessCommandLine. */
+FxBool switch_S3_flag_ignore=FXFALSE;   /* set by /dS3; not read elsewhere in this file */
+//FxBool switch_force=FXFALSE;
+FxBool switch_C0_bias=FXTRUE;           /* cleared by /nb; not read elsewhere in this file */
+int switch_voodoo_loc = 0;              /* value given with /i */
+FxU32 num_voodoos=0;                    /* incremented by FindNecessaryCards per Voodoo device */
+
+//#define TESTING 1
+
+#ifdef TESTING
+/* Canned ranges that GetMemoryMap loads instead of scanning the bus. */
+RangeStruct test_data[6]=  {{0xF0000000,0x100000,1,0,0,0,0},
+                            {0xF3000000,0x200000,4,1,0,0,0},
+                            {0xF3000000,0x200000,8,0,0,0,0},
+                            {0xF5000000,0x200000,2,0,0,0,0},
+                            {0xE6000000,0x200000,5,0,0,0,0},
+                            {0xD3001000,0x200000,3,0,0,0,0}};
+#endif
+
+RangeStruct map[80];          /* storage for the list nodes, filled by AddMapEntry */
+RangeStruct hole[80];         /* not referenced by the functions in this file */
+long        num_holes=0;      /* not referenced by the functions in this file */
+RangeStruct *first_entry;     /* list head (lowest address) */
+RangeStruct *last_entry;      /* list tail (highest address) */
+long        entries=0;        /* number of map[] slots in use */
+RangeStruct master_hole;      /* free region reported by FindHole */
+long        voodoo_loc;       /* not referenced by the functions in this file */
+FxU32       conflicts_found=0; /* conflicts handled by fxremap_dowork */
+
+/*
+ * Scan the PCI bus and relocate every Voodoo board whose memory range
+ * collides with another device.
+ *
+ *   argc, argv     command line, handed to ProcessCommandLine
+ *   doit_silently  nonzero suppresses all console output
+ *
+ * Returns early when no Voodoo device is present or when no free region
+ * can be found for a conflicting board.
+ */
+void fxremap_dowork(int argc,char **argv,int doit_silently)
+{
+   RangeStruct *conflict;
+
+   silent = doit_silently;
+
+   ProcessCommandLine(argv,argc);
+
+   InitRemap();
+
+   if (!FindNecessaryCards())
+   {
+      if (!silent) {
+       printf("This program was only meant to be used with the 3dfx Voodoo chipset\n");
+       printf("to correct possible pci address conflicts.\n");
+       printf("No Voodoo chipset was detected\n");
+      }
+      ForceCleanUp();
+      /* ForceCleanUp() does not terminate the program, so stop here:
+       * without a Voodoo board there is nothing that may be remapped. */
+      return;
+   }
+
+   GetMemoryMap();
+
+   /* expand region of mapping for S3 card */
+   AdjustMapForS3();
+   /* see if we find any conflicts with any voodoo card */
+   while ((conflict=TestForConflicts()) != null)
+   {
+      conflicts_found++;
+      /* since it is going to move */
+      /* remove entry, so we can possibly use it as a hole */
+      RemoveEntry(conflict);
+      if (FindHole(conflict))
+      {
+         conflict->address=master_hole.address;
+         RemapVoodoo(conflict);
+      }
+      else
+      {
+         if (!silent) {
+            printf("Unable to find region to map conflicting board\n");
+         }
+         ForceCleanUp();
+         return;
+      }
+   }
+
+   if (!conflicts_found) {
+      if (!silent) {
+        printf("No conflict with the Voodoo cards was found\n");
+      }
+   }
+   CloseRemap();
+}
+
+/*
+ * Library entry point: run the remapper with no command-line options
+ * and with all console output suppressed.
+ */
+void fxremap(void) {
+  fxremap_dowork(0,NULL,1);
+}
+
+/*
+ * Command-line entry point: run the remapper with the given arguments
+ * and with console output enabled.
+ *
+ * Returns int to match the prototype in fxremap.h; the value is always 0
+ * because fxremap_dowork reports no status.
+ */
+int fxremap_main(int argc,char **argv) {
+  fxremap_dowork(argc,argv,0);
+  return 0;
+}
+
+
+/* Open the PCI library before any configuration-space access.
+ * The result of pciOpen() is not checked. */
+void InitRemap(void)
+{
+   pciOpen();
+}
+
+/* Counterpart of InitRemap. Currently does nothing: the pciClose() call
+ * is commented out. */
+void CloseRemap(void)
+{
+   // pciClose();
+}
+
+/*
+ * Value-returning wrapper around pciGetConfigData.
+ *
+ * Returns the register contents, or 0xFFFFFFFF when pciGetConfigData
+ * does not report FXTRUE.
+ */
+FxU32 pciGetConfigData_R(PciRegister reg, FxU32 devNum) {
+   FxU32 value;
+
+   if (pciGetConfigData(reg,devNum,&value) != FXTRUE)
+      return (0xFFFFFFFF);
+   return (value);
+}
+
+#define PCI_NORMAL_TYPE 0
+#define PCI_BRIDGE_TYPE 1
+
+/*
+ * Record one base-address register of a device in the memory map.
+ * Nothing is added when pciGetAddress rejects the register (value zero,
+ * or bit 0 set).
+ */
+static void add_bar_entry(PciRegister reg,FxU32 device_number)
+{
+   FxU32 address,range;
+
+   if (!pciGetAddress(reg,device_number,&address))
+      return;
+   pciGetRange(reg,device_number,&range);
+   AddMapEntry(address,range,device_number,
+               IsCardVoodoo(device_number),IsCardS3(device_number));
+}
+
+/*
+ * Build the sorted list of memory ranges claimed by PCI devices.
+ *
+ * For every existing device, each function is probed: all eight when the
+ * header-type register has bit 7 set or when the device is a 3dfx
+ * (0x121a) device with id 0x02, otherwise only function 0. Normal
+ * headers contribute BAR0, BAR1 and the ROM base; bridge headers
+ * contribute BAR0 and BAR1. PCI_IO_BASE_ADDRESS is deliberately not
+ * scanned (legacy address, not needed for sst1-type boards without 2D).
+ *
+ * With TESTING defined the list is loaded from test_data instead.
+ */
+void GetMemoryMap(void)
+{
+#ifdef TESTING
+   FxU32    temp,temp2;
+   int      i;
+
+   for (i=0;i<6;i++)
+   {
+      temp=test_data[i].address;
+      temp2=~(test_data[i].range - 0x1);
+      AddMapEntry(temp,temp2,test_data[i].id,test_data[i].is_voodoo,test_data[i].is_S3);
+   }
+#else
+   FxU32    type;
+   long     devNum;
+   int      fn;            /* function number iterator */
+   int      maxFnNumber;
+
+   for (devNum=0;devNum<MAX_PCI_DEVICES;devNum++)
+   {
+      if (!pciDeviceExists(devNum))
+         continue;
+
+      if (pciGetConfigData_R(PCI_HEADER_TYPE,devNum) & (1<<7)) {
+         maxFnNumber = 8; /* multifunction! */
+      } else if ((pciGetConfigData_R(PCI_VENDOR_ID,devNum) == 0x121a) &&
+                 (pciGetConfigData_R(PCI_DEVICE_ID,devNum) == 0x02)) {
+         maxFnNumber = 8; /* single board SLI! */
+      } else {
+         maxFnNumber = 1;
+      }
+
+      for (fn=0;fn<maxFnNumber;fn++) {
+         int i = devNum | (fn << 13); /* add function number */
+
+         if (pciGetConfigData_R(PCI_VENDOR_ID,i) == 0xFFFF)
+            continue;               /* function does not exist */
+
+         /* two header types */
+         /* one for bridges and one for everything else */
+         type=pciGetType(i);
+         if (type==PCI_NORMAL_TYPE) {
+            add_bar_entry(PCI_BASE_ADDRESS_0,i);
+            add_bar_entry(PCI_BASE_ADDRESS_1,i);
+            add_bar_entry(PCI_ROM_BASE_ADDRESS,i);
+         } else if (type==PCI_BRIDGE_TYPE) {
+            add_bar_entry(PCI_BASE_ADDRESS_0,i);
+            add_bar_entry(PCI_BASE_ADDRESS_1,i);
+         }
+      } /* for all function numbers */
+   }
+#endif
+}
+
+/*
+ * Widen every entry flagged as an S3 card: its start is lowered by
+ * S3_SHIFT and its range is set to twice S3_SHIFT.
+ */
+void AdjustMapForS3(void)
+{
+   RangeStruct *entry;
+
+   for (entry=first_entry;entry;entry=entry->next)
+   {
+      if (!entry->is_S3)
+         continue;
+      entry->address-=S3_SHIFT;
+      entry->range=S3_SHIFT<<1;
+   }
+}
+
+/*
+ * Walk the sorted range list and return the first Voodoo entry that has
+ * to move, or null when there is none.
+ *
+ * An entry is returned when
+ *   - it is a hidden-SLI Voodoo2 (is_voodoo == 2) at address 0xFF00000,
+ *   - it overlaps its successor and either of the two is a Voodoo (the
+ *     lower one is preferred), or
+ *   - it is the last entry and extends beyond END_ADDRESS.
+ * NOTE(review): the last-entry case does not check is_voodoo -- confirm
+ * that this is intended.
+ *
+ * Overlaps between two non-Voodoo devices are only reported.
+ */
+RangeStruct *TestForConflicts(void)
+{
+   RangeStruct *cur,*next;
+
+   for (cur=first_entry;cur;cur=cur->next)
+   {
+      /* if this is a poorly mapped voodoo2 single board SLI, then remap */
+      if ((cur->is_voodoo == 2) && (cur->address == 0xFF00000))
+         return cur;
+
+      next=cur->next;
+      if (!next)
+      {
+         if ((cur->address + cur->range) > END_ADDRESS)
+            return cur;
+         continue;
+      }
+
+      if ((cur->address + cur->range) <= next->address)
+         continue;                     /* no overlap with the successor */
+
+      if (cur->is_voodoo)
+         return cur;
+      if (next->is_voodoo)
+         return next;
+
+      if (!silent) {
+         printf("FxRemap: Possible PCI conflict not with Voodoo device\n");
+         /* one conversion per argument; FxU32 is widened explicitly so
+          * the format is correct whatever its underlying type */
+         printf("%lX (%lX) <-> %lX (%lX)\n",
+                (unsigned long)cur->id, (unsigned long)cur->address,
+                (unsigned long)next->id, (unsigned long)next->address);
+      }
+   }
+   return null;
+}
+
+/*
+ * Add a memory range to the map, keeping the list sorted by address.
+ *
+ *   address     base address as read from the device (0 = ignore)
+ *   range       sizing value read back from the register; the stored
+ *               length is ((~range) >> 4) + 1
+ *   id          device identifier stored with the entry
+ *   VoodooCard  value stored in is_voodoo
+ *   S3Card      value stored in is_S3
+ *
+ * The call is ignored when address is 0, when an entry with the same
+ * (address >> 4) already exists, or when the map[] table is full.
+ */
+void AddMapEntry(FxU32 address,FxU32 range,FxU32 id,FxBool VoodooCard,FxBool S3Card)
+{
+   RangeStruct *temp,*cur;
+   long        entry;
+   FxU32       key=address>>4;
+
+   /* only if address != 0 */
+   if (!address)
+      return;
+
+   /* jcochrane@3dfx.com: ignore duplicate entries in the map table */
+   for (entry=0;entry<entries;entry++)
+   {
+      if (key == map[entry].address)
+         return;
+   }
+
+   /* never write past the end of map[] */
+   if (entries >= (long)(sizeof(map)/sizeof(map[0])))
+   {
+      if (!silent) {
+         printf("FxRemap: memory map table full, entry ignored\n");
+      }
+      return;
+   }
+
+   temp=&map[entries++];
+   temp->address=key;
+   temp->range=((~range)>>4)+0x1;
+   temp->id=id;
+   temp->is_voodoo=VoodooCard;
+   temp->is_S3=S3Card;
+
+   if (entries<=1)
+   {
+      /* first entry: it is both head and tail */
+      first_entry=temp;
+      last_entry=temp;
+      temp->next=null;
+      temp->prev=null;
+      return;
+   }
+
+   /* find the first entry that starts above the new one */
+   for (cur=first_entry;cur;cur=cur->next)
+   {
+      if (temp->address < cur->address)
+         break;
+   }
+
+   if (cur)
+   {
+      /* link in front of cur */
+      temp->next=cur;
+      temp->prev=cur->prev;
+      cur->prev=temp;
+      if (cur==first_entry)
+         first_entry=temp;
+      else
+         (temp->prev)->next=temp;
+   }
+   else
+   {
+      /* highest address so far: append */
+      last_entry->next=temp;
+      temp->prev=last_entry;
+      last_entry=temp;
+      temp->next=null;
+   }
+}
+
+/*
+ * Unlink an entry from the range list and clear its links.
+ *
+ * An entry with neither neighbour is left alone: a message is printed
+ * (unless silent) and ForceCleanUp is called instead.
+ */
+void RemoveEntry(RangeStruct *del)
+{
+   if (!del->next && !del->prev)
+   {
+      if (!silent) {
+        printf("FxRemap: No entries mapped\n");
+      }
+      ForceCleanUp();
+      return;
+   }
+
+   /* detach from the predecessor, or move the head */
+   if (del->prev)
+      del->prev->next=del->next;
+   else
+      first_entry=del->next;
+
+   /* detach from the successor, or move the tail */
+   if (del->next)
+      del->next->prev=del->prev;
+   else
+      last_entry=del->prev;
+
+   del->next=null;
+   del->prev=null;
+}
+
+/*
+ * Link an entry into the range list at the position given by its
+ * address. It goes in front of the first entry with a higher address,
+ * or at the tail when there is none.
+ */
+void InsertEntry(RangeStruct *ins)
+{
+   RangeStruct *pos;
+
+   ins->next=null;
+   ins->prev=null;
+
+   if (!first_entry)
+   {
+      /* empty list */
+      first_entry=ins;
+      last_entry=ins;
+      return;
+   }
+
+   for (pos=first_entry;pos;pos=pos->next)
+   {
+      if (ins->address < pos->address)
+         break;
+   }
+
+   if (!pos)
+   {
+      /* nothing above it: append */
+      ins->prev=last_entry;
+      last_entry->next=ins;
+      last_entry=ins;
+      return;
+   }
+
+   ins->next=pos;
+   ins->prev=pos->prev;
+   pos->prev=ins;
+   if (ins->prev)
+      (ins->prev)->next=ins;
+   else
+      first_entry=ins;
+}
+
+/*
+ * Round address up to the next multiple of the conflict's range, using
+ * at least 0x10000 as the step. An address that already has none of the
+ * (step - 1) bits set is returned unchanged.
+ * NOTE(review): the mask arithmetic assumes the range is a power of two.
+ */
+FxU32 SnapToDecentAddress(FxU32 address,RangeStruct *conflict)
+{
+   FxU32 step=(conflict->range<0x10000) ? 0x10000 : conflict->range;
+   FxU32 low_bits=step-1;
+
+   if ((address & low_bits)==0)
+      return address;
+   return (address & ~low_bits) + step;
+}
+
+/*
+ * Check whether the conflicting range fits between the end of `begin`
+ * and `end`. The candidate start is the end of `begin`, rounded up by
+ * SnapToDecentAddress.
+ *
+ * On success `hole` receives the start and the space up to `end`, and
+ * FXTRUE is returned. The fit must be strict: a range that would end
+ * exactly at `end` is rejected (safer, at the cost of some space).
+ */
+FxBool fits_in_hole(RangeStruct *begin,FxU32 end,RangeStruct *hole,RangeStruct *conflict)
+{
+   FxU32 start=SnapToDecentAddress(begin->address+begin->range,conflict);
+
+   if ((start+conflict->range)>=end)
+      return FXFALSE;
+
+   hole->address=start;
+   hole->range=end-start;
+   return FXTRUE;
+}
+
+/*
+ * Check whether the conflicting range fits below `first`, starting at
+ * `minimum` rounded up by SnapToDecentAddress.
+ *
+ * On success `hole` receives the start and the space up to the start of
+ * `first`, and FXTRUE is returned.
+ */
+FxBool fits_under(RangeStruct *first,FxU32 minimum,RangeStruct *hole,RangeStruct *conflict)
+{
+   FxU32 start=SnapToDecentAddress(minimum,conflict);
+
+   if ((start+conflict->range) >= first->address)
+      return FXFALSE;
+
+   hole->address=start;
+   hole->range=first->address - start;
+   return FXTRUE;
+}
+
+
+/*
+ * Look for a free region large enough for the conflicting range and
+ * store it in master_hole.
+ *
+ * The gap after each entry is tried first (up to the next entry, or up
+ * to END_ADDRESS after the last one). If none fits, the space between
+ * 0xA000000 and the first entry is tried.
+ */
+FxBool FindHole(RangeStruct *conflict)
+{
+   RangeStruct *entry;
+
+   for (entry=first_entry;entry;entry=entry->next)
+   {
+      FxU32 limit=entry->next ? entry->next->address : END_ADDRESS;
+
+      if (fits_in_hole(entry,limit,&master_hole,conflict))
+         return FXTRUE;
+   }
+
+   /* see if we can find a hole located below addressed boards */
+   /* don't want to go below 0xA000000 for addressing our boards */
+   if ((first_entry->address > 0xA000000) &&
+       fits_under(first_entry,0xA000000,&master_hole,conflict))
+      return FXTRUE;
+
+   return FXFALSE;
+}
+
+/*
+ * Put the relocated entry back into the range list and, unless TESTING
+ * is defined, write its new address (entry address << 4) to
+ * PCI_BASE_ADDRESS_0 of the device.
+ */
+void RemapVoodoo(RangeStruct *conflict)
+{
+#ifndef TESTING
+   FxU32    new_base;
+#endif
+
+   /* put conflict back into memory map */
+   InsertEntry(conflict);
+
+#ifndef TESTING
+   new_base=(conflict->address)<<4;
+   pciSetConfigData(PCI_BASE_ADDRESS_0,conflict->id,&new_base);
+#endif
+   if (silent)
+      return;
+   printf("Remapped Voodoo Board to avoid a conflict\n");
+}
+
+/*
+ * Read the sizing value of a base-address register: the current value
+ * is saved, 0xFFFFFFFF is written, the register is read back, and the
+ * saved value is restored. The read-back value is stored in *data.
+ */
+void pciGetRange(PciRegister reg,FxU32 device_number,FxU32 *data)
+{
+   FxU32    all_ones=0xFFFFFFFF;
+   FxU32    original;
+   FxU32    readback;
+
+   pciGetConfigData(reg,device_number,&original);
+   pciSetConfigData(reg,device_number,&all_ones);
+   pciGetConfigData(reg,device_number,&readback);
+   pciSetConfigData(reg,device_number,&original);
+
+#ifdef TESTING
+   printf("PciGetRange: save %08x \n",original);
+   printf("PciGetRange: temp %08x \n",all_ones);
+   printf("PciGetRange: size %08x \n",readback);
+   printf("PciGetRange: save %08x \n",original);
+#endif
+   *data=readback;
+}
+
+/*
+ * Read a base-address register into *data.
+ *
+ * Returns FXFALSE when the value is zero or has bit 0 set, FXTRUE
+ * otherwise.
+ */
+FxBool pciGetAddress(PciRegister reg,FxU32 device_number,FxU32 *data)
+{
+   pciGetConfigData(reg,device_number,data);
+
+   if (((*data)!=0) && !(*data & 0x01))
+      return FXTRUE;
+   return FXFALSE;
+}
+
+/* Error-path hook. Currently does nothing: both pciClose() and exit(1)
+ * are commented out, so callers continue running after calling it. */
+void ForceCleanUp(void)
+{
+   // pciClose();
+   // exit(1);
+}
+
+/*
+ * Count the Voodoo devices on the bus, adding one to num_voodoos for
+ * each.
+ *
+ * Returns FXTRUE when at least one was seen during this call; otherwise
+ * prints a warning (unless silent) and returns FXFALSE.
+ */
+FxBool FindNecessaryCards(void)
+{
+   long   dev;
+   FxBool seen=FXFALSE;
+
+   for (dev=0;dev<MAX_PCI_DEVICES;dev++)
+   {
+      if (!pciDeviceExists(dev) || !IsCardVoodoo(dev))
+         continue;
+      seen=FXTRUE;
+      num_voodoos++;
+   }
+
+   if (seen)
+      return FXTRUE;
+
+   if (!silent) {
+     printf("Warning no known voodoo card was found\n");
+   }
+   return FXFALSE;
+}
+
+/*
+ * Report a malformed option (unless silent) and run the error hook.
+ * `example` is the complete usage line to print, including its newline.
+ */
+static void usage_error(const char *example)
+{
+   if (!silent) {
+      printf("Command line: improper format\n");
+      printf("%s",example);
+   }
+   ForceCleanUp();
+}
+
+/*
+ * Parse the command line (argv[0] is skipped).
+ *
+ *   /dS3            set switch_S3_flag_ignore
+ *   /f 0xADDR       accepted but ignored in this version
+ *   /x 0xLO-0xHI    add the region to the memory map as an occupied range
+ *   /nb             clear switch_C0_bias
+ *   /i N            store N in switch_voodoo_loc
+ *
+ * A malformed or unknown option is reported and parsing continues with
+ * the next argument. An option's argument is only consumed when it was
+ * accepted.
+ */
+void ProcessCommandLine(char **argv,int argc)
+{
+   long     i;
+   FxU32    temp,temp2;
+   FxU32    address,range;
+   char     *hex_ptr;
+
+   for (i=1;i<argc;i++)
+   {
+      if (strcmp(argv[i],"/dS3")==0)
+      {
+         switch_S3_flag_ignore=FXTRUE;
+      }
+      else if(strcmp(argv[i],"/f")==0)
+      {
+         if (((i+1)<argc) && ReadHex(argv[i+1],&temp))
+            i++;
+         else
+            usage_error("ex: fxremap.exe /f 0xC0000000\n");
+
+         if (!silent) {
+           printf("Command line option /f ignored in this version\n");
+
+         }
+      }
+      else if(strcmp(argv[i],"/x")==0)
+      {
+         /* hex_ptr is tested before it is used: without a '-' the second
+          * ReadHex must not run at all */
+         if (((i+1)<argc) &&
+             ((hex_ptr=strchr(argv[i+1],'-'))!=NULL) &&
+             ReadHex(argv[i+1],&temp) &&
+             ReadHex(hex_ptr+1,&temp2))
+         {
+            address=temp;
+            range=temp2-temp;
+            range=~(range - 0x1);
+            i++;
+            AddMapEntry(address,range,0x500,FXFALSE,FXFALSE);
+         }
+         else
+         {
+            usage_error("ex: fxremap.exe /x 0xE0000000-0xF0000000\n");
+         }
+      }
+      else if (strcmp(argv[i],"/nb")==0)
+      {
+         switch_C0_bias=FXFALSE;
+      }
+      else if (strcmp(argv[i],"/i")==0)
+      {
+         /* never read past the end of argv */
+         if ((i+1)<argc)
+            switch_voodoo_loc = atoi(argv[++i]);
+         else
+            usage_error("ex: fxremap.exe /i <number>\n");
+      }
+      else
+      {
+         if (!silent) {
+           printf("Command line: improper options specified\n");
+           printf("Valid options are /dS3 /f /x /i\n");
+         }
+      }
+   }
+}
+
+/*
+ * Return the header layout of a device function: the header-type
+ * register with the multi-function flag (bit 7) removed, so the result
+ * compares directly against PCI_NORMAL_TYPE / PCI_BRIDGE_TYPE for
+ * multi-function devices too.
+ *
+ * If the register cannot be read, the preset all-ones value yields 0x7F,
+ * which matches neither type.
+ */
+FxU32 pciGetType(long i)
+{
+   FxU32 header_type=0xFFFFFFFF;
+
+   pciGetConfigData(PCI_HEADER_TYPE,i,&header_type);
+
+   return header_type & 0x7F;
+}
+
+/*
+ * Classify a device function by its vendor and device ids.
+ *
+ * Returns FXFALSE for anything that is not a 3dfx (vendor 0x121a) device
+ * with id 1..4. Ids 1, 3 and 4 return FXTRUE. Id 2 (voodoo2) returns 1
+ * for function 0 and 2 for any other function (hidden SLI), which is
+ * also announced unless silent.
+ */
+FxBool IsCardVoodoo(long i)
+{
+   FxU32    vendor,dev_id;
+   int      hidden_sli = (((i >> 13) & 0x7) != 0);
+
+   pciGetConfigData(PCI_VENDOR_ID,i,&vendor);
+   pciGetConfigData(PCI_DEVICE_ID,i,&dev_id);
+
+   if (vendor!=0x121a)
+      return FXFALSE;
+
+   switch (dev_id)
+   {
+   case 0x0001:   /* sst1 */
+   case 0x0003:   /* banshee */
+   case 0x0004:   /* h4? or whatever is next */
+      return FXTRUE;
+   case 0x0002:   /* voodoo2 */
+      if (!hidden_sli)
+         return 1;
+      if (!silent) { printf("found voodoo2 hidden sli\n"); }
+      return 2;
+   default:
+      return FXFALSE;
+   }
+}
+
+/*
+ * Returns FXTRUE when the device has vendor id 0x5333 and device id
+ * 0x88f0 or 0x8880, FXFALSE otherwise.
+ */
+FxBool IsCardS3(long i)
+{
+   FxU32    vendor,dev_id;
+
+   pciGetConfigData(PCI_VENDOR_ID,i,&vendor);
+   pciGetConfigData(PCI_DEVICE_ID,i,&dev_id);
+
+   if (vendor!=0x5333)
+      return FXFALSE;
+   if ((dev_id!=0x88f0)&&(dev_id!=0x8880))
+      return FXFALSE;
+   return FXTRUE;
+}
+
+/*
+ * Parse a hexadecimal number of the form "0x<digits>".
+ *
+ *   string  text to parse; leading spaces are skipped, the prefix must
+ *           be a lowercase "0x", digits may be upper or lower case
+ *   num     receives the value on success
+ *
+ * Parsing stops at the first character that is not a hex digit. Returns
+ * FXFALSE, leaving *num untouched, when the prefix is missing or when
+ * more than 8 digits are given (the value would not fit in 32 bits).
+ * A prefix with no digits after it is accepted and yields 0.
+ */
+FxBool ReadHex(char *string,FxU32 *num)
+{
+   long  pos=0;
+   long  digits=0;
+   FxU32 value=0;
+   FxU32 nibble;
+   char  c;
+
+   /* bypass leading spaces */
+   while (string[pos]==' ')
+      pos++;
+
+   /* verify leading 0x */
+   if (string[pos]!='0')
+      return FXFALSE;
+   pos++;
+   if (string[pos]!='x')
+      return FXFALSE;
+   pos++;
+
+   /* read in number */
+   for (;;)
+   {
+      c=string[pos];
+      if ((c>='0')&&(c<='9'))
+         nibble=c - '0';
+      else if ((c>='A')&&(c<='F'))
+         nibble=c - 'A' + 10;
+      else if ((c>='a')&&(c<='f'))
+         nibble=c - 'a' + 10;
+      else
+         break;
+
+      /* a ninth digit cannot be represented: reject instead of wrapping */
+      if (digits>=8)
+         return FXFALSE;
+
+      value=(value<<4)+nibble;
+      digits++;
+      pos++;
+   }
+   *num=value;
+   return FXTRUE;
+}
+
+/*
+ * Merge overlapping neighbours in the range list. When an entry reaches
+ * into its successor, the entry is grown (if needed) to cover the
+ * successor and the successor is unlinked; the same entry is then tested
+ * against its new successor.
+ */
+void HandleMemoryOverlap(void)
+{
+   RangeStruct *cur;
+
+   cur=first_entry;
+   while(cur)
+   {
+      RangeStruct *absorbed;
+      FxU32       span;
+
+      if ((cur==last_entry) || !overlap_map(cur,cur->next->address))
+      {
+         cur=cur->next;
+         continue;
+      }
+
+      absorbed=cur->next;
+      span=absorbed->address+absorbed->range-cur->address;
+      if (cur->range<span)
+         cur->range=span;
+
+      if (absorbed==last_entry)
+      {
+         last_entry=cur;
+         cur->next=null;
+      }
+      else
+      {
+         cur->next=absorbed->next;
+         cur->next->prev=cur;
+      }
+   }
+}
+
+/* Returns FXTRUE when the range starting at `begin` extends past `end`. */
+FxBool overlap_map(RangeStruct *begin,FxU32 end)
+{
+   return ((begin->address+begin->range)>end) ? FXTRUE : FXFALSE;
+}
diff --git a/glide3x/cvg/init/fxremap.h b/glide3x/cvg/init/fxremap.h
new file mode 100644
index 0000000..bbd5d03
--- /dev/null
+++ b/glide3x/cvg/init/fxremap.h
@@ -0,0 +1,14 @@
+/*
+ * fxremap.h
+ *
+ * pci remapper, used to remap the single board SLI slave to a valid
+ * PCI address
+ */
+
+#ifndef _FXREMAP_H_
+#define _FXREMAP_H_
+
+/* Run the remapper without command-line options and without console output. */
+void fxremap(void);
+/* Run the remapper with the given command line, with console output enabled. */
+int fxremap_main(int argc, char **argv);
+
+#endif