how to make this much faster?

General FreeBASIC programming questions.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

how to make this much faster?

Post by Mike Chambers »

this is the code from my NES emulator that renders the background image into the buffer... all this code has to be run once for every pixel! (256x240 @ 60 FPS)

it's very slow...

Code: Select all

#Macro drawbackground(dy, dx)
        Select Case As Const PPU.nametable
                Case 0: calcx = dx: calcy = dy
                Case 1: calcx = dx + 256: calcy = dy
                Case 2: calcx = dx: calcy = dy + 240
                Case 3: calcx = dx + 256: calcy = dy + 240
        End Select
        calcx = (calcx + PPU.hscroll) Mod 512
        calcy = (calcy + PPU.vscroll) Mod 480
        If (calcx < 256) And (calcy < 240) Then
                usent = 0
        ElseIf (calcx > 255) And (calcy < 240) Then
                usent = 1
        ElseIf (calcx < 256) And (calcy > 239) Then
                usent = 2
        ElseIf (calcx > 255) And (calcy > 239) Then
                usent = 3
        End If
        calcx = calcx Mod 256
        calcy = calcy Mod 240
        tmpcalcx = calcx Mod 8
        ntaddr = ntmap(usent)
        ntval = VRAM(ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3))
        patternoffset = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
        patternoffset = chrbank(patternoffset Shr 10) + (patternoffset Mod 1024)
        usebank = patternoffset Shr 10
        usebankmod = patternoffset Mod 1024
        pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1
        patternoffset = patternoffset + 8
        usebank = patternoffset Shr 10
        usebankmod = patternoffset Mod 1024
        pixel = pixel + (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)
        attribx = calcx Shr 5: attriby = calcy Shr 5
        attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
        attribbyte = attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2)) And 3
        pixel = pixel + (attribbyte Shl 2)
        backgnd(dx) = VRAM(&H3F00 + pixel)
#EndMacro
i've rewritten it in (admittedly somewhat sloppy) mostly assembly as follows, but it's still pretty slow. this routine is the biggest, by far, bottleneck in the program. even with this asm, you need a good 2+ GHz CPU to run full speed 60 FPS!

Code: Select all

#Macro drawbackground(dry, drx)
	VRAMptr = @VRAM(0)
	ntmapptr = @ntmap(0)
	calcx = drx: calcy = dry
	Select Case As Const PPU.nametable
		Case 1: 'calcx = dx + 256: calcy = dy
			Asm
				mov eax, [calcx]
				Add eax, 256
				mov [calcx], eax
			End Asm
		Case 2: 'calcx = dx: calcy = dy + 240
			Asm
				mov eax, [calcy]
				Add eax, 240
				mov [calcy], eax
			End Asm
		Case 3: 'calcx = dx + 256: calcy = dy + 240
			Asm
				mov eax, [calcx]
				Add eax, 256
				mov [calcx], eax
				mov eax, [calcy]
				Add eax, 240
				mov [calcy], eax
			End Asm
	End Select
	temp1 = PPU.hscroll
	temp2 = PPU.vscroll
	Asm
		mov eax, [calcx]
		Add eax, [temp1]
		And eax, 511
		mov [calcx], eax
		mov eax, [calcy]
		Add eax, [temp2]
		mov ebx, 480
		Xor edx, edx
		div eax, ebx
		mov [calcy], edx
	'End Asm

   	'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
		mov eax, [calcx]
		Shr eax, 8
		mov ebx, [calcy]
		Shr ebx, 8
		Shl ebx, 1
		Or eax, ebx
		mov [usent], eax
		Shl ebx, 3
		mov eax, [calcy]
		Sub eax, ebx
		mov [calcy], eax

      'this code replaces:
   	'calcx = calcx Mod 256
	   'calcy = calcy Mod 240
		mov eax, [calcx]
		And eax, 255
		mov [calcx], eax
		mov eax, [calcy]
		mov ebx, 240
		Xor edx, edx
		div eax, ebx
		mov [calcy], edx
		
		'this code replaces: tmpcalcx = calcx Mod 8
		mov eax, [calcx]
		And eax, 7
		mov [tmpcalcx], eax
		
		'this code replaces: ntaddr = ntmap(usent)
		mov ebx, [ntmapptr]
		mov esi, [usent]
		shl esi, 2
		mov eax, [ebx+esi]
		mov [ntaddr], eax

	  'this code replaces: ntval = VRAM(ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3))
		mov eax, [calcy]
		And eax, 248
		Shl eax, 2
		mov ebx, [calcx]
		Shr ebx, 3
		Add eax, ebx
		Add eax, [ntaddr]
		mov [temp1], eax
	End Asm
	ntval = VRAM(temp1)
	
	'this code replaces: patternoffset = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
	temp1 = PPU.bgtable
	Asm
		mov eax, [ntval]
		Shl eax, 4
		mov ebx, [calcy]
		And ebx, 7
		add eax, ebx
		Add eax, [temp1]
	'patternoffset = chrbank(patternoffset Shr 10) + (patternoffset Mod 1024)
		mov ebx, 1024
		Xor edx, edx
		div eax, ebx
		mov [temp1], eax
		mov [temp2], edx
	End Asm
	
	'this code replaces: pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1
	patternoffset = chrbank(temp1) + temp2
	pixel = CHRMEM(patternoffset)
	Asm
		mov al, 7
		Sub al, [tmpcalcx]
		mov [bitposit], al
		mov eax, [pixel]
		mov cl, [bitposit]
		Shr eax, cl
		And eax, 1
		mov [pixel], eax
		'patternoffset = patternoffset + 8
		mov eax, [patternoffset]
		Add eax, 8
		mov [patternoffset], eax
	End Asm
	
	'this replaces: pixel = pixel + (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)
	pixel2 = CHRMEM(patternoffset)
	Asm
		mov eax, [pixel2]
		mov cl, [bitposit]
		Shr eax, cl
		And eax, 1
		Shl eax, 1
		mov ebx, [pixel]
		Or eax, ebx
		mov [pixel], eax

		'this replaces: attribx = calcx Shr 5: attriby = calcy Shr 5
			mov eax, [calcx]
			Shr eax, 5
			mov [attribx], eax
			mov eax, [calcy]
			Shr eax, 5
			mov [attriby], eax

		'this replaces: attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
			mov esi, [attriby]
			Shl esi, 3
			Add esi, [attribx]
			Add esi, &h3C0
			Add esi, [ntaddr]
			mov ebx, [VRAMptr]
			mov eax, [ebx+esi]
			mov [attribbyte], eax
	End Asm
	
		attribbyte = attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2)) And 3
	'this next part was to replace the above line, but it doesnt work for some reason
	'	Asm
	'		mov ecx, [calcx]
	'		And ecx, 16
	'		Shr ecx, 3
	'		mov edx, [calcy]
	'		And edx, 16
	'		Shr edx, 2
	'		Or ecx, edx
	'		And ecx, 3
	'		mov eax, [attribbyte]
	'		Shr eax, cl
	'		Shl eax, 2
	'		Add eax, [pixel]
	'		Add eax, &h3F00
	'		mov [pixel], eax
		'End Asm
		pixel = pixel + (attribbyte Shl 2)
	backgnd(drx) = PPUread(&h3F00 + pixel)
#EndMacro
this is going to tough to get running fast! if possible, i'd like to see full speeds on 1 GHz processors! i know that's probably not going to be possible in FB... or is it?
D.J.Peters
Posts: 8586
Joined: May 28, 2005 3:28
Contact:

Post by D.J.Peters »

the most parts can be precalculated once of course you need an array off (256,240) values
or the most parts only changed if dy=new scanline
than you need only a table with 240 precalculated values.

Joshy
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Post by Mike Chambers »

D.J.Peters wrote:the most parts can be precalculated once of course you need an array off (256,240) values
or the most parts only changed if dy=new scanline
than you need only a table with 240 precalculated values.

Joshy
well, the thing is the background is generally constantly changing and scrolling around so there are always new values that the pixels will have.

the NES background rendering basically works like this: (highly simplified, if youre interested in more detail there are good docs on nesdev.parodius.com )

1. there are four "nametables" in memory that are looked at to see what 8x8 image tile is to be drawn for each 8x8 section of the background. horizontal and vertical scroll data is applied before grabbing the pixel, which can change exactly where these small sections begin on the screen.

2. the first two low bits of the corresponding pixel data is pulled from whats called CHR memory.

3. just above each nametable's array in video RAM, is that nametable's "attribute table"... from there is pulled the final third and highest bit of the pixel's palette value.


based on this i was thinking one possibility would be to keep a precalculated cache of all the lower two bits of each pixel value, then just calculate the attribute value (which can be changed whenever to make the same tile blocks have a different palette) and add that to it for every iteration.

the problem there though, is that while a lot of games have the CHR tile data stored in non-changing ROM, other games use CHR-RAM and these tile graphics themselves can be changed at will.

it gets very complicated!
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Post by MichaelW »

Can you replace the slow MOD 480 and MOD 240 operations with table lookups?
TJF
Posts: 3809
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Post by TJF »

MichaelW wrote:Can you replace the slow MOD 480 and MOD 240 operations with table lookups?
In the ASM code he allready replaced them by the faster AND operations.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Post by MichaelW »

TJF wrote:
MichaelW wrote:Can you replace the slow MOD 480 and MOD 240 operations with table lookups?
In the ASM code he allready replaced them by the faster AND operations.
An (n MOD m) can be replaced with (n AND m-1) only if m is a power of 2. For the MOD 480 and MOD 240 operations the ASM code uses DIV.
TJF
Posts: 3809
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Post by TJF »

MichaelW wrote:An (n MOD m) can be replaced with (n AND m-1) only if m is a power of 2. For the MOD 480 and MOD 240 operations the ASM code uses DIV.
Yes, you're right. Sorry, my misstake.

Found something

Code: Select all

' instead of
        'Select Case As Const PPU.nametable
                'Case 0: calcx = dx: calcy = dy
                'Case 1: calcx = dx + 256: calcy = dy
                'Case 2: calcx = dx: calcy = dy + 240
                'Case 3: calcx = dx + 256: calcy = dy + 240
        'End Select
        'calcx = (calcx + PPU.hscroll) Mod 512
        'calcy = (calcy + PPU.vscroll) Mod 480
' use
  SELECT CASE AS CONST PPU.nametable
    CASE 0
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup? (MichaelW)
    CASE 1
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup?
    CASE 2
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
    CASE ELSE
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
  END SELECT


' instead of
        'If (calcx < 256) And (calcy < 240) Then
                'usent = 0
        'ElseIf (calcx > 255) And (calcy < 240) Then
                'usent = 1
        'ElseIf (calcx < 256) And (calcy > 239) Then
                'usent = 2
        'ElseIf (calcx > 255) And (calcy > 239) Then
                'usent = 3
        'End If
        'calcx = calcx Mod 256
        'calcy = calcy Mod 240
' use
  usent = -((calcx >= 256) + (calcy >= 240) SHL 1)
  calcx AND= 255
  calcy MOD= 240 ' table lookup here? (MichaelW)
TJF
Posts: 3809
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Post by TJF »

Here's my version of the complete macro

Code: Select all

#MACRO drawbackground(dy, dx)
  SELECT CASE AS CONST PPU.nametable
    CASE 0
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup here? (MichaelW)
    CASE 1
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup?
    CASE 2
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
    CASE ELSE
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
  END SELECT
  ntaddr = ntmap(-((calcx >= 256) + (calcy >= 240) SHL 1))
  calcx AND= 255
  calcy MOD= 240 ' table lookup here? (MichaelW)

  ntval = VRAM(ntaddr + ((calcy AND 248) SHL 2) + (calcx SHR 3))
  tmpcalcx = 7 - (calcx AND 7)

  patternoffset = PPU.bgtable + (ntval SHL 4) + (calcy AND 7)
  patternoffset = chrbank(patternoffset SHR 10) + (patternoffset AND 1023)

  usebank = patternoffset SHR 10
  usebankmod = patternoffset AND 1023
  pixel = (CHRMEM(patternoffset) SHR tmpcalcx) AND 1

  patternoffset += 8
  usebank = patternoffset SHR 10
  usebankmod = patternoffset AND 1023
  pixel += ((CHRMEM(patternoffset) SHR tmpcalcx) AND 1) SHL 1

  attribx = calcx SHR 5 : attriby = calcy SHR 5
  attribbyte = (VRAM(ntaddr + &h3C0 + (attriby SHL 3) + attribx)) SHR _
               (((calcx AND 16) SHR 3) OR ((calcy AND 16) SHR 2)) AND 3

  pixel += attribbyte SHL 2
  backgnd(dx) = VRAM(&H3F00 + pixel)
#ENDMACRO
NOTE: this version has no usent variable anymore. I don't know if it's used outside the macro.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Post by Mike Chambers »

thanks, TJF. i will try that when i get home, i am at work right now. at home i have an old 1.13 GHz P3 laptop that is good to try these optimizations on to see how much faster i can get it running. as it stands, i have to run it on there with a frameskip of 9 (!) to get full speed which as you can guess makes it just about unplayable.

if you want to see what kind of speed it's getting now, here is a ZIP with the compiled win32 exec, SDL.dll, as well as a folder containing all of the source.

http://rubbermallet.org/moarnes-0.11.05.05.zip

i've included a couple games that play particularly well in it. megaman 2 i've played all the way to the end twice in it. :)

run moarnes.exe without any parameters to see the command line help.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Post by Mike Chambers »

TJF, your code is very slightly faster than my original pure-FB macro but definitely slower than the assembly version.

also, my original paste was apparently from a slightly old version and the last line: backgnd(dx) = VRAM(&H3F00 + pixel)

actually should have been backgnd(dx) = PPUread(&H3F00 + pixel)

PPUread is a function that performs some checking of the address being read, because certain values can change if they are from certain addresses. (the NES PPU memory map is weird. very weird.)

but calling that function was slowing it WAY down, i moved a couple lines regarding the palette address checking from the PPUread function down to above that backgnd(dx) = blah line do check the value of pixel and then read directly from that VRAM array. it's much faster, but still not amazing.

i guess it's just kind of going to be impossible to hope for too much more speed out of FreeBASIC. i've at least got it running full speed on just about any computer faster than 2.0 to 2.2 GHz or so (depending on exact chip) and that should probably suffice considering modern hardware.

(as long as youre not on a netbook with an atom...)

i think i will see what happens replacing the mod 480 stuff with a lookup table like was suggested, that's a good idea.
TJF
Posts: 3809
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Post by TJF »

Mike Chambers wrote:TJF, your code is very slightly faster than my original pure-FB macro but definitely slower than the assembly version.
Thanks for feedback! (Surprising, it works without bugfixing?)

I'm not keen on ASM code, so I can't help any further. But you may optimize by generating an ASM version based on the new FB code?
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Post by Mike Chambers »

thanks for giving it a shot! :)

and yeah, no bugfixing at all was required.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Post by MichaelW »

To get an idea of how much the DIVs are slowing the code down I extracted some representative code and timed it with and without the DIVs.

Code: Select all

'==============================================================================
#include "windows.bi"
#include "counter.bas"
'==============================================================================
'' Counter.bas is available here:
''
'' http://www.freebasic.net/forum/viewtopic.php?t=4221#4221
'==============================================================================

dim as integer calcx,temp1,calcy,temp2,tmpcalcx,ntmapptr,usent,ntaddr

sleep 3000

counter_begin( 1000, HIGH_PRIORITY_CLASS )
counter_end
print counter_cycles;" cycles, empty"

counter_begin( 1000, HIGH_PRIORITY_CLASS )
    Asm
        mov eax, [calcx]
        Add eax, [temp1]
        And eax, 511
        mov [calcx], eax
        mov eax, [calcy]
        Add eax, [temp2]
        mov ebx, 480
        Xor edx, edx
        div eax, ebx
        mov [calcy], edx
        'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
        mov eax, [calcx]
        Shr eax, 8
        mov ebx, [calcy]
        Shr ebx, 8
        Shl ebx, 1
        Or eax, ebx
        mov [usent], eax
        Shl ebx, 3
        mov eax, [calcy]
        Sub eax, ebx
        mov [calcy], eax
          'this code replaces:
          'calcx = calcx Mod 256
          'calcy = calcy Mod 240
        mov eax, [calcx]
        And eax, 255
        mov [calcx], eax
        mov eax, [calcy]
        mov ebx, 240
        Xor edx, edx
        div eax, ebx
        mov [calcy], edx
          'this code replaces: tmpcalcx = calcx Mod 8
        mov eax, [calcx]
        And eax, 7
        mov [tmpcalcx], eax
          'this code replaces: ntaddr = ntmap(usent)
        mov ebx, [ntmapptr]
        mov esi, [usent]
        shl esi, 2
        ''''mov eax, [ebx+esi]
        mov [ntaddr], eax
          'this code replaces: ntval = VRAM(ntaddr +
          ' ((calcy And 248) Shl 2) + (calcx Shr 3))
        mov eax, [calcy]
        And eax, 248
        Shl eax, 2
        mov ebx, [calcx]
        Shr ebx, 3
        Add eax, ebx
        Add eax, [ntaddr]
        mov [temp1], eax
    End Asm
counter_end
print counter_cycles;" cycles, with div"

counter_begin( 1000, HIGH_PRIORITY_CLASS )
    Asm
        mov eax, [calcx]
        Add eax, [temp1]
        And eax, 511
        mov [calcx], eax
        mov eax, [calcy]
        Add eax, [temp2]
        mov ebx, 480
        Xor edx, edx
        ''''div eax, ebx
        mov [calcy], edx
          'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
        mov eax, [calcx]
        Shr eax, 8
        mov ebx, [calcy]
        Shr ebx, 8
        Shl ebx, 1
        Or eax, ebx
        mov [usent], eax
        Shl ebx, 3
        mov eax, [calcy]
        Sub eax, ebx
        mov [calcy], eax
          'this code replaces:
          'calcx = calcx Mod 256
          'calcy = calcy Mod 240
        mov eax, [calcx]
        And eax, 255
        mov [calcx], eax
        mov eax, [calcy]
        mov ebx, 240
        Xor edx, edx
        ''''div eax, ebx
        mov [calcy], edx
          'this code replaces: tmpcalcx = calcx Mod 8
        mov eax, [calcx]
        And eax, 7
        mov [tmpcalcx], eax
          'this code replaces: ntaddr = ntmap(usent)
        mov ebx, [ntmapptr]
        mov esi, [usent]
        shl esi, 2
        ''''mov eax, [ebx+esi]
        mov [ntaddr], eax
          'this code replaces: ntval = VRAM(ntaddr +
          ' ((calcy And 248) Shl 2) + (calcx Shr 3))
        mov eax, [calcy]
        And eax, 248
        Shl eax, 2
        mov ebx, [calcx]
        Shr ebx, 3
        Add eax, ebx
        Add eax, [ntaddr]
        mov [temp1], eax
    End Asm
counter_end
print counter_cycles;" cycles, w/o div"

sleep

Running on a P3:

Code: Select all

111 cycles, with div
26 cycles, w/o div
AGS
Posts: 1284
Joined: Sep 25, 2007 0:26
Location: the Netherlands

Post by AGS »

It works great on my pc (pentium i7@2 ghz). There is quite a bit of noise on the sound output (you hear some 'cracks' and 'pops' every now and then).

I saw a couple of NES emulators that were able to emulate on a 200mhz machine. But those were programmed in delphi or some other language that compiles to highly optimized code.

Perhaps you could use the -gen gcc -O 2 command line option. Sometimes using the C back end can speed things up (sometimes but not always). Or redesign your program?

Anyway, nice program, nice games (king kong!).
DamageX
Posts: 130
Joined: Nov 21, 2009 8:42

Post by DamageX »

I used to play NES games in an emulator called NESticle back in the day which ran full speed on my 486. FB won`t be as fast as the fastest assembly but I`m sure you could do better than needing a 1Ghz+ CPU.

You are calculating the name table address and the character pattern address from scratch for every pixel. This is a lot of redundant calculation. Once you`ve calculated the name table address at the beginning of one scanline you can read the remaining bytes for that line sequentially (except when hittting the boundary between name tables during horizontal scrolling). Also, once you`ve calculated the character pattern address you might as well get all eight pixels from it for that line instead of only one.

BTW, instead of using MOD to make calcy wrap at 240 how about an if-then?

label: if calcy>239 then calcy=calcy-240:goto label

since the maximum possible value is only 239+255 right? (GOTO avoiders can use a while loop or something instead)
Post Reply