brender-1997/pentprim/xzuv1.inc
2022-05-03 14:31:40 -07:00

1272 lines
26 KiB
PHP

;; tzuvlm.inc
;;
;; Triangle rasterise loop Z, Affine Texture, MMX
;;
; Bilinear-dither variants are assembled out (every "if DITHER_BILINEAR" is false)
DITHER_BILINEAR = 0
; Select a bunch of instructions and symbols based on the direction that
; we are going. This section is the left-to-right set.
;
ifidni direction,<lr>
; Address step from one 4-pixel word of the scanline to the next
WORD_STEP = 8
; Edge mask tables indexed by (x and 3): first and last word of the span
START_MASKS equ left_masks
END_MASKS equ right_masks
; Byte offsets within temp_mask that texel indices 0..3 are stored at
M0 = 0
M1 = 1
M2 = 2
M3 = 3
; Registers that the map entries for texels 0/1 are loaded into, and the
; (destination, source) pair used to interleave them with punpcklbw
mm_t0 equ <mm0>
mm_t1 equ <mm1>
mm_01l equ <mm0>
mm_01r equ <mm1>
; Same for texels 2/3
mm_t2 equ <mm1>
mm_t3 equ <mm2>
mm_23l equ <mm1>
mm_23r equ <mm2>
; Step a packed interpolant by its per-word delta in the direction of travel
D_PADDW macro dst,src
paddw dst,src
endm
D_PADDD macro dst,src
paddd dst,src
endm
; Used directly after "cmp start,end": branch to target when start > end
J_EMPTY macro target
jg target
endm
endif
; Right-to-left set of direction dependent instructions and symbols
;
ifidni direction,<rl>
; Address step from one 4-pixel word of the scanline to the next (descending)
WORD_STEP = -8
; Edge mask tables indexed by (x and 3): first and last word of the span
START_MASKS equ right_masks
END_MASKS equ left_masks
; Byte offsets within temp_mask that texel indices 0..3 are stored at;
; reversed relative to the left-to-right set
M0 = 3
M1 = 2
M2 = 1
M3 = 0
; Registers that the map entries for texels 0/1 are loaded into, and the
; (destination, source) pair used to interleave them with punpcklbw
mm_t0 textequ <mm0>
mm_t1 textequ <mm1>
mm_01l textequ <mm1>
mm_01r textequ <mm0>
; Same for texels 2/3
mm_t2 textequ <mm2>
mm_t3 textequ <mm0>
mm_23l textequ <mm0>
mm_23r textequ <mm2>
; Step a packed interpolant by its per-word delta in the direction of travel
; (a subtract when going right to left)
D_PADDW macro dst,src
psubw dst,src
endm
D_PADDD macro dst,src
psubd dst,src
endm
; Used directly after "cmp start,end": branch to target when start < end
J_EMPTY macro target
jl target
endm
endif
; Use function pointer field in header to accumulate transparency mask
; (the colour passes store one texel index per byte at temp_mask+M0..M3)
;
temp_mask equ WORK.h.function
; NOTE(review): presumably rebases esp so that the PARAM/WORK references
; below resolve correctly - confirm against the definitions of those symbols
sub esp,PARAM_OFFSET
; Unpack U&V (with optional tiling), to use 32 bit accumulators
;
;; Unpack U & V
;;
; If going right to left, flip d_x direction and advance start position
; to end of 4 pixel boundary
;
ifidni direction,<rl>
; U: start += 3*dx, then dx = -dx
mov eax,PARAM.s_u
mov ecx,PARAM.d_u_x
add eax,ecx
add eax,ecx
add eax,ecx
neg ecx
mov PARAM.s_u,eax
mov PARAM.d_u_x,ecx
; V: start += 3*dx, then dx = -dx
mov ebx,PARAM.s_v
mov edx,PARAM.d_v_x
add ebx,edx
add ebx,edx
add ebx,edx
neg edx
mov PARAM.s_v,ebx
mov PARAM.d_v_x,edx
endif
; Find the shifts for the texture and tile size, and generate masks for each part
;
; On exit:
;   eax = tile_s            (shift count used by PACK_U)
;   ebx = width_s           (shift count used by PACK_V)
;   ecx = width_s + height_s
;   mm5/mm6/mm7 = field masks described below
;
; Zero-extend the three byte-sized shift counts
xor eax,eax
xor ebx,ebx
xor ecx,ecx
mov al,PARAM.tinfo.tile_s
mov bl,PARAM.tinfo.width_s
mov cl,PARAM.tinfo.height_s
movq mm5,qword ptr uv_masks[eax*8] ; Tile mask
add ecx,ebx ; ecx = width_s + height_s
add ebx,eax ; ebx = width_s + tile_s
movq mm3,qword ptr uv_masks[ebx*8] ; U mask
movq mm4,qword ptr uv_masks[ecx*8] ; V mask
movq mm6,mm5
movq mm7,mm3
pandn mm6,mm3 ; Clear low bits of U mask (mm6 = U mask and not tile mask)
pandn mm7,mm4 ; Clear low bits of V mask (mm7 = V mask and not U mask)
sub ebx,eax ; Adjust V shift (ebx = width_s again)
; mm5 = V low integer mask (2 copies)
; mm6 = U integer mask (2 copies)
; mm7 = V high integer mask (2 copies)
;
; eg: (256x256 with 2 bit tile)
;
; mm5: 00000000 00000000 00110000 00000000
; mm6: 00000000 00111111 11000000 00000000
; mm7: 00111111 11000000 00000000 00000000
;
; PACK_U - convert the two 32-bit U values held in mm0 into the packed
; accumulator layout.
;
;   In:       mm0 = two U values, eax = shift for the integer part,
;             mm5/mm6 = masks set up above
;   Out:      mm0 = (fraction >> 1, masked) or (integer << eax, masked by mm6)
;   Clobbers: mm1, mm4
;   bridge:   non-zero (the default) additionally sets fraction_bit and the
;             mm5 bits in the result; invoked this way for deltas, and with
;             0 for start values.
;             NOTE(review): presumably so that carries ripple across the bits
;             that belong to V when the delta is added - confirm
;
PACK_U macro bridge:=<1>
movq mm1,mm0 ; working copy for the integer part
movd mm4,eax ; shift count
psrad mm0,1 ; align fraction bits
pand mm0,qword ptr fraction_mask ; keep only the fraction bits
pslld mm1,mm4 ; shift integer part up
pand mm1,mm6 ; keep only the U integer bits
por mm0,mm1 ; merge integer and fraction
if bridge
por mm0,mm5 ; set bridging bits
por mm0,qword ptr fraction_bit
endif
endm
; PACK_V - convert the two 32-bit V values held in mm0 into the packed
; accumulator layout. The V integer part is split into a low field (taken
; unshifted through mm5) and a high field (shifted up by ebx, through mm7).
;
;   In:       mm0 = two V values, ebx = shift for the high integer part,
;             mm5/mm6/mm7 = masks set up above
;   Out:      mm0 = packed value
;   Clobbers: mm1, mm2, mm4
;   bridge:   non-zero (the default) additionally sets fraction_bit and the
;             mm6 bits in the result; invoked this way for deltas, and with
;             0 for start values.
;             NOTE(review): presumably so that carries ripple across the bits
;             that belong to U when the delta is added - confirm
;
PACK_V macro bridge:=<1>
movq mm1,mm0 ; working copy for the high integer part
movq mm2,mm0 ; working copy for the low integer part
movd mm4,ebx ; shift count
psrad mm0,1 ; align fraction bits
pand mm0,qword ptr fraction_mask ; keep only the fraction bits
pslld mm1,mm4 ; shift integer part up
pand mm1,mm7 ; keep only the high integer bits
pand mm2,mm5 ; keep only the low integer bits
por mm0,mm1 ; merge high integer bits and fraction
por mm0,mm2 ; merge low integer bits
if bridge
por mm0,mm6 ; set bridging bits
por mm0,qword ptr fraction_bit
endif
endm
;; U
;;
; Convert the U parameters to packed form and store them in WORK:
;   WORK.d_u_y1 / WORK.d_u_y0  packed per-scanline deltas, each replicated
;                              into both dwords
;   WORK.d_u_x                 packed delta for a step of 4 pixels (dx << 2)
;   WORK.u0                    packed start values { s, s+2*dx }
;   WORK.u1                    packed start values { s+dx, s+3*dx }
; Note that PACK_U clobbers mm1 and mm4.
;
movd mm0,PARAM.d_u_y1 ; Load dy1
punpckldq mm0,qword ptr PARAM.d_u_y0 ; and dy0
PACK_U
movq mm1,mm0
punpckldq mm0,mm0 ; mm0 = packed dy1 in both dwords
punpckhdq mm1,mm1 ; mm1 = packed dy0 in both dwords
movq qword ptr WORK.d_u_y1,mm0
movq qword ptr WORK.d_u_y0,mm1
movd mm0,PARAM.d_u_x
punpckldq mm0,mm0
pslld mm0,2 ; dx * 4: one step per 4-pixel word
PACK_U
movd WORK.d_u_x,mm0 ; only the low dword is stored
movd mm0,PARAM.s_u ; Load start value (replicated into both dwords of mm0 below)
pxor mm1,mm1
movd mm3,PARAM.d_u_x
punpckldq mm0,mm0 ; mm0 = { s, s }
punpckldq mm1,mm3 ; Load dx into hi dword of mm1: mm1 = { 0, dx }
punpckldq mm3,mm3 ; mm3 = { dx, dx }
paddd mm1,mm1 ; mm1 = { 0, 2*dx }
paddd mm0,mm1 ; Adjust the second start pixel's value: mm0 = { s, s+2*dx }
paddd mm3,mm0 ; mm3 = { s+dx, s+3*dx }
PACK_U 0
movq qword ptr WORK.u0,mm0
movq mm0,mm3
PACK_U 0
movq qword ptr WORK.u1,mm0
;; V
;;
; Convert the V parameters to packed form and store them in WORK:
;   WORK.d_v_y1 / WORK.d_v_y0  packed per-scanline deltas, each replicated
;                              into both dwords
;   WORK.d_v_x                 packed delta for a step of 4 pixels (dx << 2)
;   WORK.v0                    packed start values { s, s+2*dx }
;   WORK.v1                    packed start values { s+dx, s+3*dx }
; Note that PACK_V clobbers mm1, mm2 and mm4.
;
movd mm0,PARAM.d_v_y1 ; Load dy1
punpckldq mm0,qword ptr PARAM.d_v_y0 ; and dy0
PACK_V
movq mm1,mm0
punpckldq mm0,mm0 ; mm0 = packed dy1 in both dwords
punpckhdq mm1,mm1 ; mm1 = packed dy0 in both dwords
movq qword ptr WORK.d_v_y1,mm0
movq qword ptr WORK.d_v_y0,mm1
movd mm0,PARAM.d_v_x
punpckldq mm0,mm0
pslld mm0,2 ; dx * 4: one step per 4-pixel word
PACK_V
movd WORK.d_v_x,mm0 ; only the low dword is stored
movd mm0,PARAM.s_v ; Load start value (replicated into both dwords of mm0 below)
pxor mm1,mm1
movd mm3,PARAM.d_v_x
punpckldq mm0,mm0 ; mm0 = { s, s }
punpckldq mm1,mm3 ; Load dx into hi dword of mm1: mm1 = { 0, dx }
punpckldq mm3,mm3 ; mm3 = { dx, dx }
paddd mm1,mm1 ; mm1 = { 0, 2*dx }
paddd mm0,mm1 ; Adjust the second start pixel's value: mm0 = { s, s+2*dx }
paddd mm3,mm0 ; mm3 = { s+dx, s+3*dx }
PACK_V 0
movq qword ptr WORK.v0,mm0
movq mm0,mm3
PACK_V 0
movq qword ptr WORK.v1,mm0
; Build U & V masks
;
; WORK.u_mask = U integer mask (mm6) or fraction_mask
; WORK.v_mask = V high and low integer masks (mm7 or mm5) or fraction_mask
; These are ANDed into the accumulators after every add. Only the low
; dword of each is stored.
;
if DITHER_BILINEAR
; Bridge bits to OR into the dither offsets (not assembled: DITHER_BILINEAR = 0)
movq mm0,mm5
movq mm1,mm6
por mm0,fraction_bit
por mm1,fraction_bit
movd WORK.u_bridge_bits,mm0
movd WORK.v_bridge_bits,mm1
endif
por mm7,mm5
por mm6,qword ptr fraction_mask
por mm7,qword ptr fraction_mask
movd WORK.u_mask,mm6
movd WORK.v_mask,mm7
if SCREENDOOR
; Macro defined outside this section
UNPACK_SCREENDOOR_ALPHA
endif
; Macro defined outside this section.
; NOTE(review): presumably fills in the Z accumulators and deltas named in
; its arguments, as PACK_U/PACK_V do for U and V - confirm against the macro
UNPACK_PARAM_16 PARAM.s_z,PARAM.d_z_x, WORK.z0, WORK.d_z_y1, WORK.d_z_y0, WORK.d_z_x
;; Setup for first iteration of loop and generate pointers to starting scanline
;;
; start_scanline is replicated into both dwords of mm5, multiplied against
; the qword at WORK.h.screen_stride with pmaddwd (16-bit multiplies, adjacent
; products summed) and added to the qword at WORK.h.screen_address.
; NOTE(review): this appears to rely on screen_stride/depth_stride and
; screen_address/depth_address being adjacent dword pairs, and on the
; operands fitting in 16 bits - confirm against the WORK.h layout
;
; On arrival at 'start':
;   eax = integer part of xm, ebx = integer part of x1 (or x2)
;   edi = screen address, esi = depth address of the scanline
;   ebp = start_scanline and 3 (only in SCREENDOOR/DITHER builds)
;
movd mm5,WORK.h.start_scanline
;V
movq mm6,qword ptr WORK.h.screen_stride
punpckldq mm5,mm5
if SCREENDOOR or DITHER or DITHER_BILINEAR
mov ebp,WORK.h.start_scanline
endif
pmaddwd mm5,mm6
mov eax,WORK.h.xm
mov ebx,WORK.h.x1
sar eax,16
mov ecx,WORK.h.counts
paddd mm5,qword ptr WORK.h.screen_address
sar ebx,16
if SCREENDOOR or DITHER or DITHER_BILINEAR
; From here on start_scanline only holds the scanline number modulo 4
and ebp,3
mov WORK.h.start_scanline,ebp
endif
movq qword ptr WORK.h.screen_address,mm5
mov edi,WORK.h.screen_address
test ecx,ecx ; sign of counts: negative means no scanlines in the top trapezoid
mov esi,WORK.h.depth_address ; (mov does not disturb the flags from test)
jns start
; Top trapezoid is empty
;
rol ecx,16 ; swap the two 16-bit counts
; V
test ecx,ecx
js skip_triangle ; second count is negative as well: nothing to draw
; Switch to the second edge (x2/d_x2) before starting
mov WORK.h.counts,ecx
mov ecx,WORK.h.d_x2
mov ebx,WORK.h.x2
mov WORK.h.d_x1,ecx
mov WORK.h.x1,ebx
sar ebx,16
jmp start
skip_triangle:
; Exit macro defined outside this section
RASTERISE_EXIT
;; Switch to lower trapezoid
;;
; next_trapezoid_1: ecx = counts with the exhausted count still in the hi word
; next_trapezoid_2: ecx = counts already rotated by the caller
;
next_trapezoid_1:
rol ecx,16
next_trapezoid_2:
test ecx,ecx ; See if there is another count of bottom of dword
js skip_triangle
;sl_bottom:
mov ebx,WORK.h.d_x2
; Version of X increments that loads lower X info
;
; Increment X's
;
; Same as the increment at sl_loop, except that d_x1 is replaced by d_x2
; and ebx is loaded with x2 as-is (no delta is added on this first line)
;
mov WORK.h.counts,ecx
mov eax,WORK.h.xm
mov WORK.h.d_x1,ebx
mov ecx,WORK.h.d_xm
mov edx,eax
add eax,ecx
mov WORK.h.xm,eax
xor edx,ecx
xor edx,eax ; edx = old xm xor d_xm xor new xm
mov ebx,WORK.h.x2
shl edx,14 ; CF = bit 18 of edx; consumed by the sbb at sl_bottom_cont
jmp sl_bottom_cont ; (jmp leaves the flags intact)
; Jump here for empty or skipped line
;
next_line:
mov eax,WORK.h.xm ; Load up edges for next loop
mov ebx,WORK.h.x1
mov ecx,WORK.h.counts
;V
sub ecx,10000h ; Decrement count (in hi word of dword)
js next_trapezoid_1 ; count went negative: this trapezoid is finished
;; Per scanline updates
;;
; At entry: eax = xm, ebx = x1 (both still with 16 fraction bits),
;           ecx = decremented counts
;
sl_loop:
; Increment X's and generate flag for carry from bit 17->18 (crossing dword)
;
mov WORK.h.counts,ecx
mov ecx,WORK.h.d_xm
mov edx,eax
add eax,ecx
mov WORK.h.xm,eax
xor edx,ecx
xor edx,eax ; edx = old xm xor d_xm xor new xm: the carry into each bit of the sum
mov ecx,WORK.h.d_x1
; Cycle
shl edx,14 ; CF = bit 18 of edx
lea ebx,[ebx+ecx] ; x1 += d_x1; lea rather than add so that CF survives
sl_bottom_cont:
sbb edx,edx ; edx = -1 if CF was set, else 0
mov WORK.h.x1,ebx
; eax: start X (pixel)
; ebx: end X (pixel)
;
; edx: -1 or 0 (if carry)
; Increment parameters
;
; [edx*8] picks the qword at d_*_y0 when edx = 0, or the qword 8 bytes
; below it when edx = -1.
; NOTE(review): presumably d_*_y1 is laid out immediately before d_*_y0
; in WORK - confirm
;
movq mm0,qword ptr WORK.u0
movq mm1,qword ptr WORK.v0
movq mm2,qword ptr WORK.u1
movq mm3,qword ptr WORK.v1
paddd mm0,qword ptr WORK.d_u_y0[edx*8]
paddd mm1,qword ptr WORK.d_v_y0[edx*8]
paddd mm2,qword ptr WORK.d_u_y0[edx*8]
paddd mm3,qword ptr WORK.d_v_y0[edx*8]
; Replicate the U and V masks and clear everything outside each
; accumulator's own bits
movd mm4,WORK.u_mask
punpckldq mm4,mm4
movd mm5,WORK.v_mask
punpckldq mm5,mm5
pand mm0,mm4
pand mm1,mm5
pand mm2,mm4
pand mm3,mm5
movq qword ptr WORK.u0,mm0
movq qword ptr WORK.v0,mm1
movq qword ptr WORK.u1,mm2
movq qword ptr WORK.v1,mm3
movq mm5,qword ptr WORK.d_z_y0[edx*8]
;V
; mm0/mm1 are left holding the new Z values for the Z pass
movq mm0,qword ptr WORK.z0
movq mm1,qword ptr WORK.z2
paddd mm0,mm5
paddd mm1,mm5
movq qword ptr WORK.z0,mm0
;V
movq qword ptr WORK.z2,mm1
;V
; Increment addresses
;
if DITHER or SCREENDOOR or DITHER_BILINEAR
; Advance the scanline number modulo 4
mov ebp,WORK.h.start_scanline
inc ebp
and ebp,3
mov WORK.h.start_scanline,ebp
endif
mov edi,WORK.h.screen_address
mov esi,WORK.h.depth_address
add edi,WORK.h.screen_stride
add esi,WORK.h.depth_stride
sar eax,16 ; Get integer part of X start
mov WORK.h.screen_address,edi
sar ebx,16 ; Get integer part of X end
mov WORK.h.depth_address,esi
; Entry point for rendering one scanline
;
; At entry:
;   eax = start X, ebx = end X (integer pixels)
;   esi = depth buffer scanline, edi = colour buffer scanline
;   mm0/mm1 = Z values for the scanline
;   ebp = scanline number and 3 (only in SCREENDOOR/DITHER builds)
;
start:
; Generate masks and pointers
;
cmp eax,ebx
J_EMPTY next_line ; No pixels on line at all
if SCREENDOOR
; Lookup up screendoor transparency mask
;
mov ecx,WORK.h._c
or ebp,ecx ; table index = WORK.h._c or (scanline and 3)
endif
mov ecx,eax
mov edx,ebx
if SCREENDOOR
movq mm6,qword ptr screendoor_masks[ebp*8]
endif
; Split both X values into 4-pixel word (eax, ebx) and pixel within
; the word (ecx, edx)
and eax,not 3
and ebx,not 3
and ecx,3
and edx,3
sub eax,ebx ; eax = start word - end word, in pixels
je one_word ; Scanline fits in one word
;; Render scanline
;;
;; Z Pass
;;
; Depth-tests the whole scanline four pixels at a time, writes back the
; merged Z values, and records one mask word per pixel in the scanline_mask
; buffer for the colour pass. The OR of all masks is accumulated so the
; colour pass can be skipped when no pixel passed.
; The loop is software pipelined: each iteration stores the results of the
; previous one, and z_loop_tail flushes the last two words.
;
if SCREENDOOR
movq qword ptr screendoor_mask,mm6
endif
; Setup for Z
;
add eax,eax ; convert pixel number to address offset
mov ebp,offset scanline_mask
lea esi,[esi+ebx*2] ; Move pointers on to end of scanline
lea edi,[edi+ebx*2]
; Z read test, and write - 2.6 cycles per pixel
;
; Depth test and writeback for a scanline of 2 words or more
;
; At entry:
;
; mm0 Z01 16.16 16.16
; mm1 Z23 16.16 16.16
;
; ebp pointer to end of mask buffer
; esi pointer to end of scanline in depth buffer
; eax -count
;
; ecx pixel within first word of scanline start
; edx pixel within last word of scanline end
;
; Loop head
;
;
movq mm2,mm0 ; Make a copy of Z01
movq mm3,mm1 ; Make a copy of Z23
D_PADDD mm0,qword ptr WORK.d_z_x ; Add delta to Z01
psrad mm2,16 ; Shift to get integer parts
D_PADDD mm1,qword ptr WORK.d_z_x ; Add delta to Z23
psrad mm3,16
mov ebx,eax ; Copy of loop counter for second pass
packssdw mm2,mm3 ; merge Zs down to 16 bits per pixel
movq mm7,[esi+eax] ; read current z buffer pixels
movq mm6,mm2 ; make copy of new Z's
ifidni direction,<lr>
sub ebp,eax ; Set up mask pointer for left to right
endif
; The unsigned saturating subtract leaves zero wherever new Z <= old Z,
; so the compare against zero gives all-ones in those words
psubusw mm6,mm7 ; Compare old and new Z's (Unsigned)
pcmpeqw mm6,qword ptr zeros ; Make a mask from the results of PSUB
pxor mm5,mm5 ; Clear the accumulated mask
pand mm6,qword ptr START_MASKS[ecx*8] ; Mask for front of scanline
;V
if SCREENDOOR
pand mm6,qword ptr screendoor_mask
endif
movq mm4,mm6 ; Make a copy of mask
pand mm2,mm6 ; Select new Z's using mask
pandn mm6,mm7 ; Select old Z's using mask
add eax,WORD_STEP ; Loop control
por mm6,mm2 ; Combine old and new Z's
je z_loop_tail ; flags are still those of the add (MMX ops do not change them)
z_loop_body:
; Loop body
;
; At this point:
;
; mm0 current Z01
; mm1 current Z23
; mm4 previous mask
; mm5 previous accumulated mask
; mm6 previous merged pixels
;
; edi end of line
; eax current count
;
movq mm2,mm0 ; Make copies of the current Z01 values
movq mm3,mm1 ; Make copies of the current Z23 values
D_PADDD mm0,qword ptr WORK.d_z_x ; Add delta to Z01
psrad mm2,16 ; Shift to get integer parts of Z01
D_PADDD mm1,qword ptr WORK.d_z_x ; Add delta to Z23
psrad mm3,16 ; Shift to get integer parts of Z23
movq mm7,[esi+eax] ; read current z buffer pixels
packssdw mm2,mm3 ; merge Zs down to 16 bits per pixel
movq [esi+eax-WORD_STEP],mm6 ; Write to prev. Z buffer pixels
movq mm6,mm2 ; make copy of new Z's
; 10 cycle stall here if destination is not in cache
;
movq [ebp+eax-WORD_STEP],mm4 ; Write out previous mask
psubusw mm6,mm7 ; Compare old and new Z's (Unsigned)
pcmpeqw mm6,qword ptr zeros ; Make a mask from the results of PSUB
por mm5,mm4 ; Accumulate previous mask
if SCREENDOOR
pand mm6,qword ptr screendoor_mask
endif
movq mm4,mm6 ; Make a copy of mask
pand mm2,mm6 ; Select new Z's using mask
pandn mm6,mm7 ; Select old Z's using mask
add eax,WORD_STEP
por mm6,mm2 ; Combine old and new Z's
jne z_loop_body
; Loop tail
;
; eax is zero here: store the previous word's results and process the
; final word, which lives at [esi] / [ebp]
;
z_loop_tail:
movq mm7,[esi] ; read current z buffer pixels
psrad mm0,16 ; Shift to get integer parts of Z01
movq [esi-WORD_STEP],mm6 ; Write to prev. Z buffer pixels
psrad mm1,16 ; Shift to get integer parts of Z23
movq [ebp-WORD_STEP],mm4 ; Write out previous mask
packssdw mm0,mm1 ; merge Zs down to 16 bits per pixel
movq mm6,mm0 ; make copy of new Z's
;UV
psubusw mm6,mm7 ; Compare old and new Z's (Unsigned)
;UV
pcmpeqw mm6,qword ptr zeros ; Make a mask from the results of PSUB
;V
pand mm6,qword ptr END_MASKS[edx*8] ; Mask for end of scanline
por mm5,mm4 ; Accumulate previous mask
if SCREENDOOR
pand mm6,qword ptr screendoor_mask
endif
movq mm4,mm6 ; Make a copy of mask
pand mm0,mm6 ; Select new Z's using mask
pandn mm6,mm7 ; Select old Z's using mask
movq mm7,qword ptr mask_6 ; Set up a constant mask in mm7
por mm6,mm0 ; Combine old and new Z's
movq [ebp],mm4 ; Write out mask
por mm5,mm4 ; Accumulate mask
movq [esi],mm6 ; Write to Z buffer pixels
packsswb mm5,mm5 ; Squash the accumulated mask so it can be tested in eax
movd eax,mm5
;V
test eax,eax ; Early out of no pixels visible on scanline
je next_line
;; Colour Pass
;;
;; MANY WORD CASE
;;
; MMX Linear texture mapping, 4 pixels at a time, with transparency
;
; mm0 = pixels 0 & 1
; mm1 = pixels 2 & 3
; mm2 = scratch
; mm3 = scratch
; mm4 = WORK.u0 (U for samples 0 and 2, as stored by the setup code)
; mm5 = WORK.v0 (V for samples 0 and 2)
; mm6 = WORK.u1 (U for samples 1 and 3)
; mm7 = WORK.v1 (V for samples 1 and 3)
;
; At entry (from the Z pass):
;   ebp = end of the scanline mask buffer, edi = end of scanline in the
;   colour buffer, ebx = the loop counter saved at the head of the Z pass
;
mov WORK.mask_end,ebp
mov WORK.screen_end,edi
mov eax,ebx ; restart the word counter
xor edx,edx ; dl/cl receive texel bytes; the upper bits must stay zero
xor ecx,ecx ; because edx/ecx are used whole as map indices
if DITHER_BILINEAR
; (not assembled: DITHER_BILINEAR = 0)
mov ebx,WORK.h.start_scanline
movd mm2,WORK.u_bridge_bits
pxor mm5,mm5
movq mm4,dither_table_11x[ebx*8]
movq mm6,mm4
movd mm3,WORK.v_bridge_bits
punpcklwd mm4,mm5
punpckhwd mm6,mm5
movq mm5,mm4
movq mm7,mm6
punpckldq mm2,mm2
punpckldq mm3,mm3
por mm4,mm2
por mm6,mm2
por mm5,mm3
por mm7,mm3
paddd mm4,WORK.u0
paddd mm6,WORK.u1
paddd mm5,WORK.v0
paddd mm7,WORK.v1
; XXX Mask
movq mm0,WORK.u_mask
movq mm1,mm0
punpckldq mm0,mm0
punpckhdq mm1,mm1
pand mm4,mm0
pand mm5,mm1
pand mm6,mm0
pand mm7,mm1
else
movq mm4,qword ptr WORK.u0
movq mm5,qword ptr WORK.v0
movq mm6,qword ptr WORK.u1
movq mm7,qword ptr WORK.v1
endif
mov esi,WORK.texture_address
movq mm1,mm6 ; Copy of U's (WORK.u1), consumed at the top of c_loop_body
; Colour pass inner loop: one 4-pixel word per iteration.
;
; At entry to each iteration:
;   eax = offset of the current word from the end pointers
;         (WORK.screen_end / WORK.mask_end)
;   mm4/mm5 = packed U/V accumulators for two of the samples,
;   mm6/mm7 = packed U/V accumulators for the other two
;   mm1 = copy of mm6
;   esi = WORK.texture_address
;   edx, ecx = zero above the low byte (they receive texel bytes in dl/cl
;              and are then used whole as map indices)
;
; Each iteration fetches 4 texel bytes, looks each one up in the table at
; WORK.map_address (4 bytes per entry), converts the result to 16-bit
; pixels and merges them into the destination under the mask written by
; the Z pass combined with the transparency mask (texel index 0 is
; transparent).
;
c_loop_body:
mov edi,WORK.map_address
movq mm0,mm4 ; Copy of U's
paddd mm0,mm5 ; Merge Us and Vs to get texel offsets
paddd mm1,mm7
; The qword load picks up WORK.d_u_x in the low dword and the dword that
; follows it in the high dword, which is used as the V delta.
; NOTE(review): relies on WORK.d_v_x directly following WORK.d_u_x - confirm
movq mm2,qword ptr WORK.d_u_x
psrld mm0,FRACTION_BITS+1 ; Get integer part of texel offsets
movq mm3,mm2
psrld mm1,FRACTION_BITS+1 ; Get integer part of texel offsets
movd ebx,mm0
punpckldq mm2,mm2 ; mm2 = U delta in both dwords
movd ebp,mm1
punpckhdq mm3,mm3 ; mm3 = V delta in both dwords
mov dl,[esi+ebx] ; Read Texel 0
psrlq mm0,32
mov cl,[esi+ebp] ; Read Texel 1
psrlq mm1,32
mov byte ptr temp_mask+M0,dl
paddd mm4,mm2 ; Step all four accumulators on to the next word
mov byte ptr temp_mask+M1,cl
paddd mm6,mm2
; As with the deltas: the low dword is WORK.u_mask, the high dword is used
; as the V mask.
; NOTE(review): relies on WORK.v_mask directly following WORK.u_mask - confirm
movq mm2,qword ptr WORK.u_mask
paddd mm5,mm3
movd ebx,mm0
paddd mm7,mm3
movd ebp,mm1
movq mm3,mm2
movd mm_t0,[edi+edx*4] ; Map entry for texel 0
punpckldq mm2,mm2 ; mm2 = U mask in both dwords
movd mm_t1,[edi+ecx*4] ; Map entry for texel 1
punpckhdq mm3,mm3 ; mm3 = V mask in both dwords
mov dl,[esi+ebx] ; Read Texel 2
punpcklbw mm_01l,mm_01r
mov cl,[esi+ebp] ; Read Texel 3
mov byte ptr temp_mask+M2,dl
pand mm4,mm2 ; Clear everything outside each accumulator's own bits
pand mm6,mm2
movd mm_t2,[edi+edx*4] ; Map entry for texel 2
pand mm5,mm3
movd mm_t3,[edi+ecx*4] ; Map entry for texel 3
pand mm7,mm3
mov byte ptr temp_mask+M3,cl
punpcklbw mm_23l,mm_23r
; Now have the four texels and transparency mask
;
; mm0 a0.a1.r0.r1.g0.g1.b0.b1
; mm1 a2.a3.r2.r3.g2.g3.b2.b3
;
; Unpack to:
; mm0 = R3.R2.R1.R0
; mm1 = G3.G2.G1.G0
; mm2 = B3.B2.B1.B0
; mm3 = M3.M2.M1.M0
;
if DITHER
movd mm3,temp_mask ; Get the 4 texel indices
movq mm2,mm0
mov ebx,WORK.h.start_scanline ; Dither table row for this scanline
punpcklwd mm0,mm1
mov edi,WORK.screen_end
punpckhwd mm2,mm1
movq mm1,mm0
punpcklbw mm2,mm2 ; Unpack blues
paddusw mm2,qword ptr dither_table_11[ebx*8]
punpcklbw mm0,mm0 ; Unpack reds
punpckhbw mm1,mm1 ; Unpack greens
paddusw mm0,qword ptr dither_table_11[ebx*8]
paddusw mm1,qword ptr dither_table_10[ebx*8]
punpcklbw mm3,mm3 ; Convert mask bytes to words
pcmpeqw mm3,qword ptr zeros ; mm3 now has the transparency mask for 4 texels
pand mm2,qword ptr mask_5
; Mask green to the width of the destination format before it is shifted
; into place: mask_5 for 15-bit pixels, mask_6 for 16-bit pixels. This
; matches the non-dithered branch below. The 16-bit case was previously
; guarded by COLOUR_TYPE_15, which left green unmasked in 16-bit dithered
; builds so that its low bits were ORed into the neighbouring field.
if COLOUR_TYPE_15
pand mm1,qword ptr mask_5
endif
if COLOUR_TYPE_16
pand mm1,qword ptr mask_6
endif
psrlw mm0,11
if COLOUR_TYPE_15
psrlw mm2,1
endif
mov esi,WORK.mask_end
por mm0,mm2
else
movd mm3,temp_mask ; Get the 4 texel indices
movq mm2,mm0
pcmpeqb mm3,qword ptr zeros ; mm3 now has the transparency mask for 4 texels
punpcklwd mm0,mm1
mov edi,WORK.screen_end
punpckhwd mm2,mm1
movq mm1,mm0
punpcklbw mm2,mm2 ; Unpack blues
pand mm2,qword ptr mask_5
punpcklbw mm0,mm0 ; Unpack reds
mov esi,WORK.mask_end
punpckhbw mm1,mm1 ; Unpack greens
if COLOUR_TYPE_15
pand mm1,qword ptr mask_5
endif
if COLOUR_TYPE_16
pand mm1,qword ptr mask_6
endif
psrlw mm0,11
if COLOUR_TYPE_15
psrlw mm2,1
endif
punpcklbw mm3,mm3 ; Convert mask bytes to words
por mm0,mm2
endif
; mm3 = (not transparent) and Z-pass mask: all-ones in every word to be written
pandn mm3,[esi+eax] ; Merge transp with current mask
if COLOUR_TYPE_15
psrlw mm1,6
endif
if COLOUR_TYPE_16
psrlw mm1,5
endif
movq mm2,[edi+eax] ; Read old destination
por mm0,mm1
pand mm0,mm3 ; Keep new pixels where the mask is set
pandn mm3,mm2 ; Keep old pixels elsewhere
por mm0,mm3
test eax,eax ; Was this the last word? (offset 0 is the end of the scanline)
mov esi,WORK.texture_address
movq mm1,mm6 ; Copy of U's for the next iteration
movq [edi+eax],mm0
lea eax,[eax+WORD_STEP] ; lea so that the flags from the test survive
jne c_loop_body
; Scanline finished: reload the edges and carry on with the next one
mov ecx,WORK.h.counts
mov eax,WORK.h.xm ; Load up edges for next loop
mov ebx,WORK.h.x1
sub ecx,10000h ; Decrement count (in hi word of dword)
jns sl_loop
rol ecx,16
jmp next_trapezoid_2
; Scanline whose start and end fall within the same 4-pixel word
;
one_word:
; Depth test and writeback for a scanline of 1 word
;
; At entry:
;
; mm0 Z01 16.16 16.16
; mm1 Z23 16.16 16.16
; mm6 screendoor mask (SCREENDOOR builds only)
;
; esi pointer to depth buffer scanline
; edi pointer to colour buffer scanline
; ebx pixel number of the word (x and not 3)
;
; ecx pixel within word of scanline start
; edx pixel within word of scanline end
;
movq mm7,[esi+ebx*2] ; read current z buffer pixels
psrad mm0,16 ; Shift to get integer parts
if SCREENDOOR
pand mm6,qword ptr START_MASKS[ecx*8] ; Mask for front of scanline
else
movq mm6,qword ptr START_MASKS[ecx*8] ; Mask for front of scanline
endif
psrad mm1,16
pand mm6,qword ptr END_MASKS[edx*8] ; Mask for back of scanline
packssdw mm0,mm1 ; merge Zs down to 16 bits per pixel
movq mm1,mm0 ; make copy of new Z's
; The unsigned saturating subtract leaves zero wherever new Z <= old Z,
; so the compare against zero gives all-ones in those words
psubusw mm0,mm7 ; Compare old and new Z's (Unsigned)
pcmpeqw mm0,qword ptr zeros ; Make a mask from the results of PSUB
pand mm0,mm6 ; Mask for length scanline
movq mm5,mm0 ; copy mask for early out test
pand mm1,mm0 ; Select new Z's using mask
pandn mm0,mm7 ; Select old Z's using mask
por mm0,mm1 ; Combine old and new Z's
movq mm6,mm5 ; copy mask for colour pass
movq [esi+ebx*2],mm0 ; Write to Z buffer pixels
packsswb mm5,mm5 ; Merge mask down to 32 bits
movd eax,mm5
test eax,eax ; Early out of no pixels visible on scanline
je next_line
;; ONE WORD CASE
;;
;; mm6 = pixel mask
;
; Colour pass for a single 4-pixel word: fetches 4 texel bytes, looks each
; one up in the table at WORK.map_address, converts to 16-bit pixels and
; merges them into the destination under mm6 combined with the
; transparency mask (texel index 0 is transparent).
; NOTE(review): ecx and edx are used whole as map indices after only cl/dl
; are loaded; at this point they hold the (x and 3) values from 'start', so
; bits 8 and above are zero.
;
mov eax,ebx ; eax = pixel number of the word
if DITHER_BILINEAR
; (not assembled: DITHER_BILINEAR = 0)
mov ebx,WORK.h.start_scanline
movd mm2,WORK.u_bridge_bits
pxor mm5,mm5
movq mm0,dither_table_11x[ebx*8]
movq mm1,mm0
movd mm3,WORK.v_bridge_bits
punpcklwd mm0,mm5
punpckhwd mm1,mm5
movq mm5,mm0
movq mm7,mm1
punpckldq mm2,mm2
punpckldq mm3,mm3
por mm0,mm2
por mm1,mm2
por mm5,mm3
por mm7,mm3
paddd mm0,WORK.u0
paddd mm1,WORK.u1
paddd mm5,WORK.v0
paddd mm7,WORK.v1
; XXX Mask
movq mm2,WORK.u_mask
movq mm3,mm2
punpckldq mm2,mm2
punpckhdq mm3,mm3
pand mm0,mm2
pand mm5,mm3
pand mm1,mm2
pand mm7,mm3
paddd mm0,mm5
paddd mm1,mm7
else
; Merge Us and Vs to get texel offsets
movq mm0,qword ptr WORK.u0
movq mm1,qword ptr WORK.u1
paddd mm0,qword ptr WORK.v0
paddd mm1,qword ptr WORK.v1
endif
psrld mm0,FRACTION_BITS+1 ; Get integer part of texel offsets
mov esi,WORK.texture_address
psrld mm1,FRACTION_BITS+1 ; Get integer part of texel offsets
movd ebx,mm0
movd ebp,mm1
mov dl,[esi+ebx] ; Read Texel 0
psrlq mm0,32
mov cl,[esi+ebp] ; Read Texel 1
psrlq mm1,32
mov edi,WORK.map_address
mov byte ptr temp_mask+M0,dl
mov byte ptr temp_mask+M1,cl
movd ebx,mm0
movd ebp,mm1
movd mm_t0,[edi+edx*4] ; Map entry for texel 0
movd mm_t1,[edi+ecx*4] ; Map entry for texel 1
mov dl,[esi+ebx] ; Read Texel 2
punpcklbw mm_01l,mm_01r
mov cl,[esi+ebp] ; Read Texel 3
mov byte ptr temp_mask+M2,dl
movd mm_t2,[edi+edx*4] ; Map entry for texel 2
movd mm_t3,[edi+ecx*4] ; Map entry for texel 3
mov byte ptr temp_mask+M3,cl
punpcklbw mm_23l,mm_23r
; Now have the four texels and transparency mask
;
; mm0 a0.a1.r0.r1.g0.g1.b0.b1
; mm1 a2.a3.r2.r3.g2.g3.b2.b3
;
; Unpack to:
; mm0 = R3.R2.R1.R0
; mm1 = G3.G2.G1.G0
; mm2 = B3.B2.B1.B0
; mm3 = M3.M2.M1.M0
;
mov edi,WORK.h.screen_address
if DITHER
movd mm3,temp_mask ; Get the 4 texel indices
movq mm2,mm0
mov ebx,WORK.h.start_scanline ; Dither table row for this scanline
punpcklwd mm0,mm1
lea edi,[edi+eax*2] ; Address of the destination word
punpckhwd mm2,mm1
movq mm1,mm0
punpcklbw mm2,mm2 ; Unpack blues
paddusw mm2,qword ptr dither_table_11[ebx*8]
punpcklbw mm0,mm0 ; Unpack reds
punpckhbw mm1,mm1 ; Unpack greens
paddusw mm0,qword ptr dither_table_11[ebx*8]
paddusw mm1,qword ptr dither_table_10[ebx*8]
pand mm2,qword ptr mask_5
;V
pcmpeqb mm3,qword ptr zeros ; mm3 now has the transparency mask for 4 texels
if COLOUR_TYPE_15
psrlw mm2,1
endif
else
movd mm3,temp_mask ; Get the 4 texel indices
movq mm2,mm0
pcmpeqb mm3,qword ptr zeros ; mm3 now has the transparency mask for 4 texels
punpcklwd mm0,mm1
lea edi,[edi+eax*2] ; Address of the destination word
punpckhwd mm2,mm1
movq mm1,mm0
punpcklbw mm2,mm2 ; Unpack blues
pand mm2,qword ptr mask_5
punpcklbw mm0,mm0 ; Unpack reds
punpckhbw mm1,mm1 ; Unpack greens
if COLOUR_TYPE_15
psrlw mm2,1
endif
endif
; Mask green to the width of the destination format before shifting it
; into place
if COLOUR_TYPE_15
pand mm1,qword ptr mask_5
endif
if COLOUR_TYPE_16
pand mm1,qword ptr mask_6
endif
psrlw mm0,11
punpcklbw mm3,mm3 ; Convert mask bytes to words
por mm0,mm2
; mm3 = (not transparent) and pixel mask: all-ones in every word to be written
pandn mm3,mm6 ; Merge transp with current mask
if COLOUR_TYPE_15
psrlw mm1,6
endif
if COLOUR_TYPE_16
psrlw mm1,5
endif
movq mm2,[edi] ; Read old destination
por mm0,mm1
pand mm0,mm3 ; Keep new pixels where the mask is set
pandn mm3,mm2 ; Keep old pixels elsewhere
mov ecx,WORK.h.counts
por mm0,mm3
mov eax,WORK.h.xm ; Load up edges for next loop
mov ebx,WORK.h.x1
movq [edi],mm0
sub ecx,10000h ; Decrement count (in hi word of dword)
jns sl_loop
rol ecx,16
jmp next_trapezoid_2