; brender-1997/softrend/subdiv.asm
; 2022-05-03 14:31:40 -07:00
;
; 684 lines
; 14 KiB
; NASM (listing tag only; the directives below are MASM syntax)

; Target: Pentium instruction set (including privileged instructions),
; flat 32-bit memory model, C naming and calling convention.
.586p
.model flat,c
include drv.inc
; PROLOGUE - user-defined procedure entry code, installed below with
; OPTION PROLOGUE. It sets up an ebp frame and then always saves ebx, edi
; and esi, in that order. The reglist (USES) and localbytes arguments are
; ignored, so a USES clause on a proc in this file does not add or remove
; any saved register. The macro function returns 0, which tells the
; assembler that no bytes were reserved for local variables.
PROLOGUE MACRO procname, flags, argbytes, localbytes, reglist, userparms:VARARG
push ebp
mov ebp,esp
push ebx
push edi
push esi
exitm <0>
endm
; EPILOGUE - user-defined procedure exit code, emitted for every ret inside
; a proc. It pops esi, edi, ebx and ebp (the reverse of PROLOGUE) and returns
; without removing arguments from the stack. esp is not reloaded from ebp,
; so a procedure body must leave esp exactly where PROLOGUE left it; ebp
; itself may be used as a scratch register by the body.
EPILOGUE MACRO procname, flags, argbytes, localbytes, reglist, userparms:VARARG
pop esi
pop edi
pop ebx
pop ebp
ret
endm
; Route all proc entry/exit code generation through the two macros above.
OPTION PROLOGUE:PROLOGUE
OPTION EPILOGUE:EPILOGUE
.code
; COMPUTE_COMPONENT_MID_POINT_VALUES C_P
;
; Averages one float component (dword index C_P) of three source vertices
; pairwise and stores the three midpoints.
;
; In:    edx -> p0, edi -> p1, esi -> p2 (source vertices)
;        eax -> m0, ebx -> m1, ecx -> m2 (destination vertices)
; Out:   [ecx+4*C_P] = (p0 + p1) * 0.5      (m2)
;        [eax+4*C_P] = (p1 + p2) * 0.5      (m0)
;        [ebx+4*C_P] = (p0 + p2) * 0.5      (m1)
; FPU:   uses up to three stack slots and leaves the stack as it found it.
; Clobb: none of the integer registers; the ASSUME on the first line stays
;        in effect after the macro.
;
; The column on the right of each instruction shows the FPU stack after
; that instruction, st(0) first.
COMPUTE_COMPONENT_MID_POINT_VALUES macro C_P
assume eax:ptr dword, ebx:ptr dword, ecx:ptr dword, edx:ptr dword, edi:ptr dword, esi:ptr dword
; compiler = 27 cycles per component
;15 cycles per component
; 0 1 2 3 4 5 6 7
fld [edx+4*C_P] ; p0
fadd [edi+4*C_P] ; m2*2
fld [edi+4*C_P] ; p1 m2*2
fadd [esi+4*C_P] ; m0*2 m2*2
fxch st(1) ; m2*2 m0*2
fmul fp_half ; m2 m0*2
fld [edx+4*C_P] ; p0 m2 m0*2
fxch st(2) ; m0*2 m2 p0
fmul fp_half ; m0 m2 p0
fxch st(2) ; p0 m2 m0
fadd [esi+4*C_P] ; m1*2 m2 m0
fxch st(1) ; m2 m1*2 m0
fstp [ecx+4*C_P] ; m1*2 m0
fmul fp_half ; m1 m0
fxch st(1) ; m0 m1
fstp [eax+4*C_P] ; m1
fstp [ebx+4*C_P] ;
endm
; PROJECT_COMPONENT C_P,C_SP
;
; Projects one component of three vertices at once:
;     [v+4*C_SP] = comp_scales[C_SP] * [v+4*C_P] * (1/w of v) + comp_offsets[C_SP]
; for v = eax, ebx and ecx.
;
; In:    eax, ebx, ecx -> the three vertices
;        ebp -> br_renderer (scales and offsets are read from _state.cache)
;        FPU stack on entry: st(0) = reciprocal w for the ebx vertex,
;                            st(1) = reciprocal w for the eax vertex,
;                            st(2) = reciprocal w for the ecx vertex
;        (labelled 1/w1 1/w0 1/w2 in the stack column below)
; Out:   the three projected values stored at dword index C_SP
; FPU:   uses three further stack slots; the three reciprocals are still on
;        the stack, in the same order, on exit.
PROJECT_COMPONENT macro C_P,C_SP
assume ebp:ptr br_renderer
;21 cycles
fld [ebp]._state.cache.comp_scales[4*C_SP] ; s 1/w1 1/w0 1/w2
fmul [eax+4*C_P] ; sc0 1/w1 1/w0 1/w2
fld [ebp]._state.cache.comp_scales[4*C_SP] ; s sc0 1/w1 1/w0 1/w2
fmul [ebx+4*C_P] ; sc1 sc0 1/w1 1/w0 1/w2
fld [ebp]._state.cache.comp_scales[4*C_SP] ; s sc1 sc0 1/w1 1/w0 1/w2
fxch st(2) ; sc0 sc1 s 1/w1 1/w0 1/w2
fmul st,st(4) ; sc0/w0 sc1 s 1/w1 1/w0 1/w2
fxch st(1) ; sc1 sc0/w0 s 1/w1 1/w0 1/w2
;stall
fmul st,st(3) ; sc1/w1 sc0/w0 s 1/w1 1/w0 1/w2
fxch st(1) ; sc0/w0 sc1/w1 s 1/w1 1/w0 1/w2
fadd [ebp]._state.cache.comp_offsets[4*C_SP] ; p0 sc1/w1 s 1/w1 1/w0 1/w2
fxch st(2) ; s sc1/w1 p0 1/w1 1/w0 1/w2
fmul [ecx+4*C_P] ; sc2 sc1/w1 p0 1/w1 1/w0 1/w2
fxch st(1) ; sc1/w1 sc2 p0 1/w1 1/w0 1/w2
fadd [ebp]._state.cache.comp_offsets[4*C_SP] ; p1 sc2 p0 1/w1 1/w0 1/w2
fxch st(2) ; p0 sc2 p1 1/w1 1/w0 1/w2
fstp [eax+4*C_SP] ; sc2 p1 1/w1 1/w0 1/w2
fmul st,st(4) ; sc2/w2 p1 1/w1 1/w0 1/w2
fxch st(1) ; p1 sc2/w2 1/w1 1/w0 1/w2
fstp [ebx+4*C_SP] ; sc2/w2 1/w1 1/w0 1/w2
fadd [ebp]._state.cache.comp_offsets[4*C_SP] ; p2 1/w1 1/w0 1/w2
;stall
;stall
fstp [ecx+4*C_SP] ; 1/w1 1/w0 1/w2
endm
; TEST_AND_PROJECT_VERTEX reg
;
; Projects a single vertex unless any OUTCODES_ALL bit is set in its first
; dword, in which case the vertex is left untouched.
;
; In:    reg -> vertex (first dword is tested against OUTCODES_ALL)
;        ebp -> br_renderer; the macro issues no ASSUME of its own, so the
;        ASSUME for ebp in effect at the expansion site must be br_renderer
; Out:   [reg+4*C_SX] = comp_scales[C_SX] * x / w + comp_offsets[C_SX]
;        [reg+4*C_SY] = comp_scales[C_SY] * y / w + comp_offsets[C_SY]
;        [reg+4*C_SZ] = comp_scales[C_SZ] * z / w + comp_offsets[C_SZ]
; Clobb: edx (receives the first dword of the vertex), flags
; FPU:   uses up to three stack slots and leaves the stack as it found it.
TEST_AND_PROJECT_VERTEX macro reg ; 0 1 2 3 4 5 6 7
local done
mov edx,[reg]
test edx,OUTCODES_ALL
jnz done
; one divide produces 1/w, which is then shared by x, y and z
fld fp_one ; 1
fdiv [reg+4*C_W] ; 1/w
; wait a long time
fld [reg+4*C_X] ; x 1/w
fmul st,st(1) ; x/w 1/w
fld [reg+4*C_Y] ; y x/w 1/w
fmul st,st(2) ; y/w x/w 1/w
fxch st(2) ; 1/w x/w y/w
;stall
fmul [reg+4*C_Z] ; z/w x/w y/w
fxch st(1) ; x/w z/w y/w
;stall
fmul [ebp]._state.cache.comp_scales[4*C_SX] ; sx/w z/w y/w
fxch st(2) ; y/w z/w sx/w
;stall
fmul [ebp]._state.cache.comp_scales[4*C_SY] ; sy/w z/w sx/w
fxch st(2) ; sx/w z/w sy/w
fadd [ebp]._state.cache.comp_offsets[4*C_SX] ; SX z/w sy/w
fxch st(1) ; z/w SX sy/w
fmul [ebp]._state.cache.comp_scales[4*C_SZ] ; sz/w SX sy/w
fxch st(2) ; sy/w SX sz/w
fadd [ebp]._state.cache.comp_offsets[4*C_SY] ; SY SX sz/w
fxch st(1) ; SX SY sz/w
fstp [reg+4*C_SX] ; SY sz/w
fstp [reg+4*C_SY] ; sz/w
fadd [ebp]._state.cache.comp_offsets[4*C_SZ] ; SZ
;stall
;stall
fstp [reg+4*C_SZ]
done:
endm
; OUTCODE_ORDINATE lValue,rValue,tableName1,reg0,reg1
;
; Classifies lValue against rValue using integer operations on the two
; 32-bit operands (bit 31 is taken as the sign, bits 30..0 as the
; magnitude) and XORs the matching outcode mask into edx.
;
; A 4-bit descriptor is assembled in the top nibble of esi and then shifted
; down to form the table index:
;     bit 3 = sign bit of lValue
;     bit 2 = sign bit of rValue
;     bit 1 = 1 when magnitude(lValue) is below magnitude(rValue)
;     bit 0 = 1 when magnitude(rValue) is below magnitude(lValue)
;
; In:    lValue, rValue = dword memory operands
;        tableName1     = 16-entry dword table of outcode XOR masks
;        edx            = outcode flags accumulated so far
; Out:   edx = edx XOR tableName1[descriptor]
; Clobb: esi, edi, reg0, reg1, flags
OUTCODE_ORDINATE macro lValue,rValue,tableName1,reg0,reg1
;assumes edx contains outcode flags,esi target for descriptor edi,reg0,reg1
;14 cycles - (at least 2.5 wasted cycles)
mov reg0,rValue
mov esi,080000000h
and esi,reg0
mov reg1,lValue
mov edi,080000000h
and reg0,07fffffffh
shr esi,1
and edi,reg1
or esi,edi
mov edi,reg0
and reg1,07fffffffh
;stall
sub edi,reg1
sub reg1,reg0
; edi is shifted without masking: only its bit 31 can land in the top
; nibble, and everything below bit 28 is discarded by the final shr
shr edi,3
and reg1,080000000h
shr reg1,2
or esi,edi
or esi,reg1
lea reg0,tableName1
shr esi,28
;stall
;stall
;stall
mov reg0,[reg0+4*esi]
;stall
xor edx,reg0
endm
; DOT_PRODUCT_4 reg0,reg1,dest
;
; Four-element float dot product.
;
; In:    reg0 -> u[0..3], reg1 -> v[0..3] (consecutive dwords)
; Out:   dest = u0*v0 + u1*v1 + u2*v2 + u3*v3, summed as (p0+p1) + (p2+p3)
; Clobb: no integer registers; the ASSUMEs for reg0 and reg1 are changed to
;        ptr dword and stay in effect after the macro
; FPU:   uses up to four stack slots and leaves the stack as it found it.
DOT_PRODUCT_4 macro reg0,reg1,dest
assume reg0:ptr dword
assume reg1:ptr dword
;18 cycles
; 0 1 2 3 4 5 6 7
fld [reg0] ; u0
fmul [reg1] ; u0*v0
fld [reg0+4] ; u1 u0*v0
fmul [reg1+4] ; u1*v1 u0*v0
fld [reg0+8] ; u2 u1*v1 u0*v0
fmul [reg1+8] ; u2*v2 u1*v1 u0*v0
fld [reg0+12] ; u3 u2*v2 u1*v1 u0*v0
fmul [reg1+12] ; u3*v3 u2*v2 u1*v1 u0*v0
fxch st(3) ; u0*v0 u2*v2 u1*v1 u3*v3
faddp st(2),st ; u2*v2 p0+p1 u3*v3
;stall ;
faddp st(2),st ; p0+p1 p2+p3
;stall
;stall
faddp st(1),st ; prod
;stall
;stall
fstp dest ;
endm
ifdef BASED_FLOAT
if BASED_FLOAT
;-----------------------------------------------------------------------
; averageVerticesOnScreen(renderer, m0, m1, m2, v0, v1, v2)
;
; Builds the three edge midpoints of triangle v0,v1,v2 and projects all of
; them. No outcodes are computed or tested in this routine.
;
; In:    renderer -> br_renderer (projection scales and offsets)
;        v0,v1,v2 -> source vertices
;        m0,m1,m2 -> destination vertices
; Out:   m2 = (v0+v1)/2, m0 = (v1+v2)/2, m1 = (v0+v2)/2 for X,Y,Z,W,I,U,V,
;        and for R,G,B as well when rend.block->convert_mask_f has any of
;        CM_R, CM_G or CM_B set; SX,SY,SZ of m0,m1,m2 are then written.
; Clobb: eax, ecx, edx, flags, FPU stack (left empty of temporaries);
;        ebx, esi, edi and ebp are restored by the custom EPILOGUE.
;
; The three reciprocals of w are obtained from a single divide:
; 1/(w0*w1*w2) multiplied by each pairwise product of the other two.
;-----------------------------------------------------------------------
averageVerticesOnScreen proc renderer:dword, m0:dword, m1:dword, m2:dword, v0:dword, v1:dword, v2:dword
; register roles required by COMPUTE_COMPONENT_MID_POINT_VALUES
mov edi,v1
mov esi,v2
mov ecx,m2
mov edx,v0
mov eax,m0
mov ebx,m1
COMPUTE_COMPONENT_MID_POINT_VALUES C_X
COMPUTE_COMPONENT_MID_POINT_VALUES C_Y
COMPUTE_COMPONENT_MID_POINT_VALUES C_Z
COMPUTE_COMPONENT_MID_POINT_VALUES C_W
;get the projection divide going
;41 cycles
; 0 1 2 3 4 5 6 7
fld [eax+4*C_W] ; w0
fmul [ebx+4*C_W] ; w1w0
fld [ebx+4*C_W] ; w1 w1w0
fmul [ecx+4*C_W] ; w1w2 w1w0
fld st(1) ; w1w0 w1w2 w1w0
fmul [ecx+4*C_W] ; w2w1w0 w1w2 w1w0
fld [eax+4*C_W] ; w0 w2w1w0 w1w2 w1w0
fmul [ecx+4*C_W] ; w2w0 w2w1w0 w1w2 w1w0
fxch st(1) ; w2w1w0 w2w0 w1w2 w1w0
fdivr fp_one ; 1/w2w1w0 w2w0 w1w2 w1w0
;perform a check to see if we can avoid doing any work.
;doesn't need to be fast as its hidden by the divide
; ebp is borrowed as a scratch register for the mask test. The pop and the
; mov that follow do not modify the flags, so the jz below still acts on
; the result of the and.
push ebp
mov ebp,rend.block
assume ebp:ptr brp_block
mov ebp,[ebp].convert_mask_f
and ebp,CM_R or CM_G or CM_B
pop ebp
; from here on ebp -> renderer rather than the stack frame; the arguments
; are no longer addressable, and the EPILOGUE pops the saved ebp
mov ebp,renderer
jz commonComponents
COMPUTE_COMPONENT_MID_POINT_VALUES C_R
COMPUTE_COMPONENT_MID_POINT_VALUES C_G
COMPUTE_COMPONENT_MID_POINT_VALUES C_B
commonComponents:
; four values are parked on the FPU stack for the divide and each
; midpoint macro needs three more, so at most seven slots are in use
COMPUTE_COMPONENT_MID_POINT_VALUES C_I
COMPUTE_COMPONENT_MID_POINT_VALUES C_U
COMPUTE_COMPONENT_MID_POINT_VALUES C_V
;complete projection
; 0 1 2 3 4 5 6 7
;5 cycles
fmul st(2),st ; 1/w2w1w0 w2w0 1/w0 w1w0
;stall
fmul st(1),st ; 1/w2w1w0 1/w1 1/w0 w1w0
;stall
fmulp st(3),st ; 1/w1 1/w0 1/w2
PROJECT_COMPONENT C_X,C_SX
PROJECT_COMPONENT C_Y,C_SY
PROJECT_COMPONENT C_Z,C_SZ
;3 cycles
; discard the three reciprocals
fstp st(0)
fstp st(0)
fstp st(0)
ret
averageVerticesOnScreen endp
endif
endif
;-----------------------------------------------------------------------
; subdivideCheck(v0, v1, v2)
;
; Decides whether a triangle should be subdivided by comparing the largest
; and smallest absolute Z (component C_Z) of its three vertices.
;
; In:    v0, v1, v2 -> brp_vertex
; Out:   eax = BR_TRUE when threshold * |Z|max >= |Z|min, else BR_FALSE,
;        where threshold is rend.subdivide_threshold
; Clobb: ecx, edx, flags; writes the static scratch dwords temp (and temp2
;        in the fixed point build), so the routine is not reentrant.
;
; NOTE(review): the custom PROLOGUE macro in this file ignores its reglist
; argument, so the USES clause below does not cause ecx or edx to be
; saved; only ebx, edi and esi are preserved.
;
; If neither BASED_FIXED nor BASED_FLOAT is non-zero the body is empty and
; execution runs off the end of the procedure.
;-----------------------------------------------------------------------
subdivideCheck proc uses ebx ecx edx,
v0 : ptr brp_vertex,
v1 : ptr brp_vertex,
v2 : ptr brp_vertex
assume eax: ptr brp_vertex, ebx: ptr brp_vertex, ecx: ptr brp_vertex
if BASED_FIXED
; Fetch the absolute values of the homogenous Zs
;
; 6-9 cycles + 0-12 cycles branch misprediction
;
mov eax,v0
mov ebx,v1
mov ecx,v2
mov eax,[eax].comp_f[C_Z*4]
mov ebx,[ebx].comp_f[C_Z*4]
mov ecx,[ecx].comp_f[C_Z*4]
test eax,eax
jns v0_zpos
neg eax
v0_zpos:
test ebx,ebx
jns v1_zpos
neg ebx
v1_zpos:
test ecx,ecx
jns v2_zpos
neg ecx
v2_zpos:
; get the order
;
; 4
;
; Each cmp sets carry when its first operand is below the second, and rcl
; shifts that carry into edx, giving a 3-bit ordering code:
;   bit 2 = |z1| below |z0|, bit 1 = |z2| below |z0|, bit 0 = |z2| below |z1|
xor edx,edx
cmp ebx,eax
rcl edx,1
cmp ecx,eax
rcl edx,1
cmp ecx,ebx
rcl edx,1
; Subdivide when threshold * |Z|max >= 65536 * |Z|min. Both sides are
; formed on the FPU from the integer Z values and the integer threshold,
; stored as 32-bit floats, and compared as integers with the sign bit
; cleared.
;
; 17 + 0/4
;
fild rend.subdivide_threshold ; t
; the sort tables hold byte offsets into the argument list, so
; [v0+offset] fetches the v0, v1 or v2 pointer for the max / min vertex
mov eax,sort_table_2[edx*4]
mov ebx,sort_table_0[edx*4]
mov eax,[v0+eax]
mov ebx,[v0+ebx]
fild [eax].comp_f[C_Z*4] ; zmax t
fild [ebx].comp_f[C_Z*4] ; zmin zmax t
fxch st(1) ; zmax zmin t
fmulp st(2),st(0) ; zmin zmax*t
fmul fp_one_fixed ; zmin*1 zmax*t
fxch st(1) ; zmax*t zmin*1
fstp temp
fstp temp2
mov eax,temp
mov ebx,temp2
and eax,07fffffffh
and ebx,07fffffffh
cmp eax,ebx
jge subdivide
mov eax,BR_FALSE
ret
subdivide:
mov eax,BR_TRUE
ret
endif ; BASED_FIXED
if BASED_FLOAT
; Sort the vertices in order of absolute value of homogeneous Z values
;
; The Z values are loaded as raw dwords and the sign bit is masked off;
; the magnitudes are then ordered with integer compares.
mov eax,v0
mov ebx,v1
mov ecx,v2
mov eax,[eax].comp_f[C_Z*4]
mov ebx,[ebx].comp_f[C_Z*4]
mov ecx,[ecx].comp_f[C_Z*4]
and eax,07fffffffh
and ebx,07fffffffh
and ecx,07fffffffh
; 3-bit ordering code in edx:
;   bit 2 = |z1| below |z0|, bit 1 = |z2| below |z0|, bit 0 = |z2| below |z1|
xor edx,edx
cmp ebx,eax
rcl edx,1
cmp ecx,eax
rcl edx,1
cmp ecx,ebx
rcl edx,1
; Subdivide when threshold * |Z|max >= |Z|min. The product is stored to
; temp as a 32-bit float and both sides are compared as integers with the
; sign bit cleared.
;
fld rend.subdivide_threshold
; the sort tables hold byte offsets into the argument list, so
; [v0+offset] fetches the v0, v1 or v2 pointer for the max / min vertex
mov eax,sort_table_2[edx*4]
mov ebx,sort_table_0[edx*4]
mov eax,[v0+eax]
mov ebx,[v0+ebx]
fmul [eax].comp_f[C_Z*4]
fstp temp
mov eax,temp
mov ebx,[ebx].comp_f[C_Z*4]
and eax,07fffffffh
and ebx,07fffffffh
cmp eax,ebx
jge subdivide
mov eax,BR_FALSE
ret
subdivide:
mov eax,BR_TRUE
ret
endif ; BASED_FLOAT
subdivideCheck endp
ifdef BASED_FLOAT
if BASED_FLOAT
;-----------------------------------------------------------------------
; averageVertices(renderer, m0, m1, m2, v0, v1, v2)
;
; Builds the three edge midpoints of triangle v0,v1,v2, computes their
; outcodes, and projects those midpoints that have no OUTCODES_ALL bit set.
;
; In:    renderer -> br_renderer
;        v0,v1,v2 -> source vertices
;        m0,m1,m2 -> destination vertices
; Out:   m2 = (v0+v1)/2, m0 = (v1+v2)/2, m1 = (v0+v2)/2 for
;        X,Y,Z,W,R,G,B,I,U,V;
;        first dword of each midpoint = its outcodes (view volume, plus
;        user clip planes when scache.user_clip_active is non-zero);
;        SX,SY,SZ written for every midpoint with no OUTCODES_ALL bit set.
; Clobb: eax, ecx, edx, flags, FPU stack (left empty of temporaries),
;        static dword dotProduct; ebx, esi, edi and ebp are restored by
;        the custom EPILOGUE.
;
; Known limitation (original author note): untested with user defined
; clip planes. See also the NOTE(review) inside the clip plane loop.
;-----------------------------------------------------------------------
averageVertices proc renderer:dword, m0:dword, m1:dword, m2:dword, v0:dword, v1:dword, v2:dword
; register roles required by COMPUTE_COMPONENT_MID_POINT_VALUES
mov edi,v1
mov esi,v2
mov ecx,m2
mov edx,v0
mov eax,m0
mov ebx,m1
;perform averaging.
COMPUTE_COMPONENT_MID_POINT_VALUES C_X
COMPUTE_COMPONENT_MID_POINT_VALUES C_Y
COMPUTE_COMPONENT_MID_POINT_VALUES C_Z
COMPUTE_COMPONENT_MID_POINT_VALUES C_W
COMPUTE_COMPONENT_MID_POINT_VALUES C_R
COMPUTE_COMPONENT_MID_POINT_VALUES C_G
COMPUTE_COMPONENT_MID_POINT_VALUES C_B
COMPUTE_COMPONENT_MID_POINT_VALUES C_I
COMPUTE_COMPONENT_MID_POINT_VALUES C_U
COMPUTE_COMPONENT_MID_POINT_VALUES C_V
;outcoding
; Each midpoint starts from OUTCODES_NOT and has one mask per axis XORed
; in, chosen by comparing X, Y and Z against W. OUTCODE_ORDINATE uses
; ebx, ecx, esi and edi as scratch, so only eax (the vertex) and edx (the
; accumulated outcodes) are live across it.
mov edx,OUTCODES_NOT
mov eax,m0
OUTCODE_ORDINATE [eax+4*C_X],[eax+4*C_W],rightLeftTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Y],[eax+4*C_W],topBottomTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Z],[eax+4*C_W],hitherYonTable,ebx,ecx
mov [eax],edx
mov edx,OUTCODES_NOT
mov eax,m1
OUTCODE_ORDINATE [eax+4*C_X],[eax+4*C_W],rightLeftTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Y],[eax+4*C_W],topBottomTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Z],[eax+4*C_W],hitherYonTable,ebx,ecx
mov [eax],edx
mov edx,OUTCODES_NOT
mov eax,m2
OUTCODE_ORDINATE [eax+4*C_X],[eax+4*C_W],rightLeftTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Y],[eax+4*C_W],topBottomTable,ebx,ecx
OUTCODE_ORDINATE [eax+4*C_Z],[eax+4*C_W],hitherYonTable,ebx,ecx
mov [eax],edx
; Save the frame pointer: ebp is used to walk the clip plane array below
; and is restored at clipDone. The midpoint pointers are reloaded first,
; while the arguments are still addressable.
push ebp
mov ecx,m2
mov eax,m0
mov ebx,m1
mov ebp,renderer
; perform clip plane outcoding if neccessary.
mov edx,scache.user_clip_active
mov edi,MAX_STATE_CLIP_PLANES-1
test edx,edx
jz clipDone
assume ebp:ptr br_renderer
; Point ebp at the last clip plane once, before the loop. The original
; code did this after the clipPlane label, so the offset was re-added on
; every iteration, and it addressed entry MAX_STATE_CLIP_PLANES (one past
; the entry that pairs with the starting value of edi).
; Assumes _state.clip holds MAX_STATE_CLIP_PLANES entries - confirm
; against drv.inc.
lea ebp,[ebp]._state.clip+(MAX_STATE_CLIP_PLANES-1)*sizeof(state_clip)
assume ebp:ptr state_clip
; Loop invariant: edi = plane index (counting down to 0), ebp -> that
; plane, eax/ebx/ecx -> m0/m1/m2.
clipPlane:
; NOTE(review): esi is never loaded from the current plane before this
; compare; at this point it holds a leftover value from the outcoding
; above or from the previous iteration. Presumably the type field of the
; state_clip entry at [ebp] should be loaded into esi here. The field
; name is not visible in this file, so the compare is left as found -
; confirm against drv.inc and add the load.
cmp esi,BRT_PLANE
jne clipNext
; m0: a negative dot product (sign bit set) toggles the outcode pair for
; this plane, (OUTCODE_USER or OUTCODE_N_USER) shifted left by edi
lea edx,[ebp].plane
lea esi,[eax+4*C_X]
DOT_PRODUCT_4 esi,edx,dotProduct
mov edx,dotProduct
mov esi,[eax]
test edx,080000000h
jz clip1
mov edx,OUTCODE_USER or OUTCODE_N_USER
push ecx
mov ecx,edi
shl edx,cl
pop ecx
xor esi,edx
mov [eax],esi
clip1:
; m1: edx was overwritten above, so the plane pointer is reloaded
lea edx,[ebp].plane
lea esi,[ebx+4*C_X]
DOT_PRODUCT_4 esi,edx,dotProduct
mov edx,dotProduct
mov esi,[ebx]
test edx,080000000h
; skip forward to m2 (the original jumped back to clip1, which never
; terminates when this dot product is non-negative)
jz clip2
mov edx,OUTCODE_USER or OUTCODE_N_USER
push ecx
mov ecx,edi
shl edx,cl
pop ecx
xor esi,edx
mov [ebx],esi
clip2:
; m2: reload the plane pointer again for the same reason
lea edx,[ebp].plane
lea esi,[ecx+4*C_X]
DOT_PRODUCT_4 esi,edx,dotProduct
mov edx,dotProduct
mov esi,[ecx]
test edx,080000000h
; move on to the next plane (the original jumped back to clip1)
jz clipNext
mov edx,OUTCODE_USER or OUTCODE_N_USER
push ecx
mov ecx,edi
shl edx,cl
pop ecx
xor esi,edx
mov [ecx],esi
clipNext:
sub ebp,sizeof(state_clip)
dec edi
jge clipPlane
clipDone:
;project if neccesary
mov edx,[eax]
; restore the frame pointer so that renderer can be read, then switch ebp
; to the renderer for the projection macros
pop ebp
mov edi,[ebx]
mov esi,[ecx]
mov ebp,renderer
; any outcode bit on any midpoint sends us to the per-vertex path
test edx,OUTCODES_ALL
jnz selectiveProjection
test edi,OUTCODES_ALL
jnz selectiveProjection
test esi,OUTCODES_ALL
jnz selectiveProjection
; All three midpoints are clear: obtain the three reciprocals of w from a
; single divide, 1/(w0*w1*w2) times each pairwise product.
;41 cycles
; 0 1 2 3 4 5 6 7
fld [eax+4*C_W] ; w0
fmul [ebx+4*C_W] ; w1w0
fld [ebx+4*C_W] ; w1 w1w0
fmul [ecx+4*C_W] ; w1w2 w1w0
fld st(1) ; w1w0 w1w2 w1w0
fmul [ecx+4*C_W] ; w2w1w0 w1w2 w1w0
fld [eax+4*C_W] ; w0 w2w1w0 w1w2 w1w0
fmul [ecx+4*C_W] ; w2w0 w2w1w0 w1w2 w1w0
fxch st(1) ; w2w1w0 w2w0 w1w2 w1w0
fdivr fp_one ; 1/w2w1w0 w2w0 w1w2 w1w0
;complete projection
; 0 1 2 3 4 5 6 7
;5 cycles
fmul st(2),st ; 1/w2w1w0 w2w0 1/w0 w1w0
;stall
fmul st(1),st ; 1/w2w1w0 1/w1 1/w0 w1w0
;stall
fmulp st(3),st ; 1/w1 1/w0 1/w2
PROJECT_COMPONENT C_X,C_SX
PROJECT_COMPONENT C_Y,C_SY
PROJECT_COMPONENT C_Z,C_SZ
;3 cycles
; discard the three reciprocals
fstp st(0)
fstp st(0)
fstp st(0)
ret
selectiveProjection:
; project each midpoint on its own, skipping any that has an outcode set
TEST_AND_PROJECT_VERTEX eax
TEST_AND_PROJECT_VERTEX ebx
TEST_AND_PROJECT_VERTEX ecx
ret
averageVertices endp
endif
endif
.data
; Vertex sort tables used by subdivideCheck. Each is indexed by the 3-bit
; ordering code
;   bit 2 = |z1| below |z0|, bit 1 = |z2| below |z0|, bit 0 = |z2| below |z1|
; and yields a byte offset (vertex number * 4) into the v0,v1,v2 argument
; list. sort_table_0 selects the vertex with the smallest |Z| and
; sort_table_2 the one with the largest; sort_table_1 gives the remaining
; vertex and is not referenced in this file.
;
; The three 8-entry tables deliberately overlap in memory: sort_table_1
; runs on through sort_table_0 into the first two entries of sort_table_2,
; and sort_table_0 runs on into the first five entries of sort_table_2.
; Do not insert, remove or reorder anything between the labels.
;   sort_table_1 reads 1,2,0,0,0,0,2,1
;   sort_table_0 reads 0,0,0,2,1,0,1,2
;   sort_table_2 reads 2,1,0,1,2,0,0,0
; Codes 2 and 5 describe contradictory orderings.
sort_table_1 dword 1*4
dword 2*4
dword 0*4
sort_table_0 dword 0*4
dword 0*4
dword 0*4
sort_table_2 dword 2*4
dword 1*4
dword 0*4
dword 1*4
dword 2*4
dword 0*4
dword 0*4
dword 0*4
; scratch dwords used to move float results from the FPU into integer
; registers in subdivideCheck
temp dword 0
temp2 dword 0
; float constants
fp_one dword 1.0f
fp_half dword 0.5f
; multiplier applied to the smallest Z in the fixed point subdivideCheck
fp_one_fixed dword 65536.0
; scratch result slot for DOT_PRODUCT_4 in averageVertices
dotProduct dword ?
; OUTCODE_TABLE name,gENameUCase,lNameUCase
;
; Emits a 16-entry dword table labelled (name)Table for use with
; OUTCODE_ORDINATE. The index is the 4-bit descriptor built by that macro:
;     bit 3 = sign of the left value (the ordinate)
;     bit 2 = sign of the right value (W in this file)
;     bit 1 = magnitude of left below magnitude of right
;     bit 0 = magnitude of right below magnitude of left
; Each entry is a mask that is XORed into the outcodes. Reading the
; entries against that encoding:
;     the OUTCODE_(gE) / OUTCODE_N_(gE) pair appears where left >= right
;     the OUTCODE_(l) / OUTCODE_N_(l) pair appears where left is below
;     minus right
; Entries 3, 7, 11 and 15 have bits 0 and 1 both set, which the descriptor
; cannot produce; they are zero.
OUTCODE_TABLE macro name,gENameUCase,lNameUCase
name&Table dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase
dword 0
dword 0
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase or OUTCODE_&lNameUCase or OUTCODE_N_&lNameUCase
dword 0
dword 0
dword OUTCODE_&lNameUCase or OUTCODE_N_&lNameUCase
dword 0
dword 0
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase or OUTCODE_&lNameUCase or OUTCODE_N_&lNameUCase
dword OUTCODE_&lNameUCase or OUTCODE_N_&lNameUCase
dword OUTCODE_&gENameUCase or OUTCODE_N_&gENameUCase or OUTCODE_&lNameUCase or OUTCODE_N_&lNameUCase
dword 0
endm
; Outcode lookup tables referenced by averageVertices, one per axis:
; rightLeftTable for X against W, topBottomTable for Y against W and
; hitherYonTable for Z against W. They are only emitted in the float build.
ifdef BASED_FLOAT
if BASED_FLOAT
OUTCODE_TABLE rightLeft,RIGHT,LEFT
OUTCODE_TABLE topBottom,TOP,BOTTOM
OUTCODE_TABLE hitherYon,HITHER,YON
endif
endif
end