it's very slow...
Code: Select all
#Macro drawbackground(dy, dx)
' Computes the background palette entry for screen pixel (dx, dy) and stores
' it in backgnd(dx).
'
' Reads : PPU.nametable, PPU.hscroll, PPU.vscroll, PPU.bgtable,
'         ntmap(), VRAM(), chrbank(), CHRMEM()
' Writes: backgnd(dx) plus the scratch variables calcx, calcy, usent,
'         tmpcalcx, ntaddr, ntval, patternoffset, pixel, attribx, attriby,
'         attribbyte - all of which must be declared by the expanding code.
'
' NOTE(review): assumes the coordinates and scroll values are non-negative,
' so the wrapped position lies in 0..511 / 0..479 - confirm against callers.

' Start from the pixel itself so calcx/calcy are never left stale, then add
' the origin of the selected base nametable (256 wide, 240 tall).
calcx = dx: calcy = dy
Select Case As Const PPU.nametable
Case 1
    calcx = calcx + 256
Case 2
    calcy = calcy + 240
Case 3
    calcx = calcx + 256
    calcy = calcy + 240
End Select

' Apply scrolling and wrap inside the 2x2 nametable area (512 x 480).
calcx = (calcx + PPU.hscroll) Mod 512
calcy = (calcy + PPU.vscroll) Mod 480

' Pick the nametable quadrant (bit 0 = right half, bit 1 = bottom half) and
' fold the coordinate back into that nametable in the same step, which
' avoids a second Mod 256 / Mod 240.
usent = 0
If calcx > 255 Then
    usent = 1
    calcx = calcx - 256
End If
If calcy > 239 Then
    usent = usent + 2
    calcy = calcy - 240
End If

tmpcalcx = calcx Mod 8          ' pixel column inside the 8x8 tile
ntaddr = ntmap(usent)

' Tile index: 32 tiles per row, so (calcy \ 8) * 32 = (calcy And 248) Shl 2.
ntval = VRAM(ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3))

' 16 bytes per tile; low bit plane first, then translate through the 1 KB
' bank table chrbank().
patternoffset = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
patternoffset = chrbank(patternoffset Shr 10) + (patternoffset Mod 1024)
pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1

' The high bit plane sits 8 bytes after the low one.
patternoffset = patternoffset + 8
pixel = pixel + (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)

' One attribute byte covers 32x32 pixels; each 16x16 quadrant owns 2 bits.
attribx = calcx Shr 5: attriby = calcy Shr 5
attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
attribbyte = (attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2))) And 3

pixel = pixel + (attribbyte Shl 2)
backgnd(dx) = VRAM(&H3F00 + pixel)
#EndMacro
I've rewritten it (admittedly somewhat sloppily) mostly in assembly, as follows, but it's still pretty slow. This routine is by far the biggest bottleneck in the program: even with this asm, you need a good 2+ GHz CPU to run at full speed (60 FPS)!
Code: Select all
#Macro drawbackground(dry, drx)
' Computes the background palette entry for screen pixel (drx, dry) and
' stores it in backgnd(drx). Mostly hand-written 32-bit x86 inline asm.
'
' Reads : PPU.nametable, PPU.hscroll, PPU.vscroll, PPU.bgtable,
'         ntmap(), VRAM(), chrbank(), CHRMEM(), PPUread()
' Writes: backgnd(drx) plus the scratch variables VRAMptr, ntmapptr, calcx,
'         calcy, usent, tmpcalcx, ntaddr, ntval, temp1, temp2, patternoffset,
'         pixel, pixel2, bitposit, attribx, attriby, attribbyte - all of
'         which must be declared by the expanding code.
'
' NOTE(review): the asm indexes ntmap() with a 4-byte stride and VRAM() with
' a 1-byte stride, and moves the scratch variables through 32-bit registers.
' Confirm the declarations match.
VRAMptr = @VRAM(0)
ntmapptr = @ntmap(0)

' Start at the pixel, then add the origin of the selected base nametable
' (each nametable is 256 pixels wide and 240 pixels tall).
calcx = drx: calcy = dry
Select Case As Const PPU.nametable
Case 1
    calcx = calcx + 256
Case 2
    calcy = calcy + 240
Case 3
    calcx = calcx + 256
    calcy = calcy + 240
End Select

temp1 = PPU.hscroll
temp2 = PPU.vscroll
Asm
    ' calcx = (calcx + PPU.hscroll) Mod 512
    mov eax, [calcx]
    add eax, [temp1]
    and eax, 511
    mov [calcx], eax

    ' edx = (calcy + PPU.vscroll) Mod 480
    mov eax, [calcy]
    add eax, [temp2]
    mov ebx, 480
    xor edx, edx
    div ebx

    ' Split the wrapped row (0..479) by the nametable height of 240:
    ' the quotient (0 or 1) is the vertical nametable bit and the remainder
    ' is the row inside that nametable. Shifting right by 8 is wrong here,
    ' because that tests against 256 rather than 240.
    mov eax, edx
    mov ebx, 240
    xor edx, edx
    div ebx
    mov [calcy], edx

    ' usent = (calcx Shr 8) Or (vertical bit Shl 1); calcx is still 0..511
    shl eax, 1
    mov ebx, [calcx]
    shr ebx, 8
    or eax, ebx
    mov [usent], eax

    ' calcx = calcx Mod 256 : tmpcalcx = calcx Mod 8
    mov eax, [calcx]
    and eax, 255
    mov [calcx], eax
    mov ebx, eax
    and ebx, 7
    mov [tmpcalcx], ebx

    ' ntaddr = ntmap(usent)
    mov ebx, [ntmapptr]
    mov esi, [usent]
    shl esi, 2
    mov eax, [ebx+esi]
    mov [ntaddr], eax

    ' temp1 = ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3)
    mov eax, [calcy]
    and eax, 248
    shl eax, 2
    mov ebx, [calcx]
    shr ebx, 3
    add eax, ebx
    add eax, [ntaddr]
    mov [temp1], eax
End Asm
ntval = VRAM(temp1)

temp1 = PPU.bgtable
Asm
    ' eax = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
    mov eax, [ntval]
    shl eax, 4
    mov ebx, [calcy]
    and ebx, 7
    add eax, ebx
    add eax, [temp1]

    ' temp1 = eax Shr 10 (1 KB bank index), temp2 = eax Mod 1024.
    ' A shift and a mask give the same result as dividing by 1024.
    mov ebx, eax
    shr eax, 10
    and ebx, 1023
    mov [temp1], eax
    mov [temp2], ebx
End Asm
patternoffset = chrbank(temp1) + temp2

' pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1
pixel = CHRMEM(patternoffset)
Asm
    mov al, 7
    sub al, [tmpcalcx]
    mov [bitposit], al
    mov eax, [pixel]
    mov cl, [bitposit]
    shr eax, cl
    and eax, 1
    mov [pixel], eax

    ' patternoffset = patternoffset + 8 (second bit plane of the tile)
    mov eax, [patternoffset]
    add eax, 8
    mov [patternoffset], eax
End Asm

' pixel = pixel Or (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)
pixel2 = CHRMEM(patternoffset)
Asm
    mov eax, [pixel2]
    mov cl, [bitposit]
    shr eax, cl
    and eax, 1
    shl eax, 1
    mov ebx, [pixel]
    or eax, ebx
    mov [pixel], eax

    ' attribx = calcx Shr 5 : attriby = calcy Shr 5
    mov eax, [calcx]
    shr eax, 5
    mov [attribx], eax
    mov eax, [calcy]
    shr eax, 5
    mov [attriby], eax

    ' attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
    ' This is a 32-bit load; only bits 0..7 can survive the shift (at most
    ' 6) and the And 3 applied below, so the extra bytes are harmless.
    mov esi, [attriby]
    shl esi, 3
    add esi, [attribx]
    add esi, &h3C0
    add esi, [ntaddr]
    mov ebx, [VRAMptr]
    mov eax, [ebx+esi]
    mov [attribbyte], eax
End Asm

' Select the 2 attribute bits that belong to this 16x16 quadrant.
' An earlier asm attempt at this line failed for two reasons: it applied
' "And 3" to the shift count (turning 4 into 0 and 6 into 2) instead of to
' the shifted value, and it added &h3F00 into pixel, which the two lines
' below would then add a second time.
attribbyte = (attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2))) And 3
pixel = pixel + (attribbyte Shl 2)
backgnd(drx) = PPUread(&h3F00 + pixel)
#EndMacro
This is going to be tough to get running fast! If possible, I'd like to see full speed on 1 GHz processors. I know that's probably not going to be possible in FB... or is it?