how to make this much faster?

General FreeBASIC programming questions.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

how to make this much faster?

Postby Mike Chambers » May 05, 2011 0:45

this is the code from my NES emulator that renders the background image into the buffer... all this code has to be run once for every pixel! (256x240 @ 60 FPS)

it's very slow...

Code: Select all

#Macro drawbackground(dy, dx)
        Select Case As Const PPU.nametable
                Case 0: calcx = dx: calcy = dy
                Case 1: calcx = dx + 256: calcy = dy
                Case 2: calcx = dx: calcy = dy + 240
                Case 3: calcx = dx + 256: calcy = dy + 240
        End Select
        calcx = (calcx + PPU.hscroll) Mod 512
        calcy = (calcy + PPU.vscroll) Mod 480
        If (calcx < 256) And (calcy < 240) Then
                usent = 0
        ElseIf (calcx > 255) And (calcy < 240) Then
                usent = 1
        ElseIf (calcx < 256) And (calcy > 239) Then
                usent = 2
        ElseIf (calcx > 255) And (calcy > 239) Then
                usent = 3
        End If
        calcx = calcx Mod 256
        calcy = calcy Mod 240
        tmpcalcx = calcx Mod 8
        ntaddr = ntmap(usent)
        ntval = VRAM(ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3))
        patternoffset = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
        patternoffset = chrbank(patternoffset Shr 10) + (patternoffset Mod 1024)
        usebank = patternoffset Shr 10
        usebankmod = patternoffset Mod 1024
        pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1
        patternoffset = patternoffset + 8
        usebank = patternoffset Shr 10
        usebankmod = patternoffset Mod 1024
        pixel = pixel + (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)
        attribx = calcx Shr 5: attriby = calcy Shr 5
        attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
        attribbyte = attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2)) And 3
        pixel = pixel + (attribbyte Shl 2)
        backgnd(dx) = VRAM(&H3F00 + pixel)
#EndMacro


i've rewritten it in (admittedly somewhat sloppy) mostly assembly as follows, but it's still pretty slow. this routine is the biggest, by far, bottleneck in the program. even with this asm, you need a good 2+ GHz CPU to run full speed 60 FPS!

Code: Select all

#Macro drawbackground(dry, drx)
   VRAMptr = @VRAM(0)
   ntmapptr = @ntmap(0)
   calcx = drx: calcy = dry
   Select Case As Const PPU.nametable
      Case 1: 'calcx = dx + 256: calcy = dy
         Asm
            mov eax, [calcx]
            Add eax, 256
            mov [calcx], eax
         End Asm
      Case 2: 'calcx = dx: calcy = dy + 240
         Asm
            mov eax, [calcy]
            Add eax, 240
            mov [calcy], eax
         End Asm
      Case 3: 'calcx = dx + 256: calcy = dy + 240
         Asm
            mov eax, [calcx]
            Add eax, 256
            mov [calcx], eax
            mov eax, [calcy]
            Add eax, 240
            mov [calcy], eax
         End Asm
   End Select
   temp1 = PPU.hscroll
   temp2 = PPU.vscroll
   Asm
      mov eax, [calcx]
      Add eax, [temp1]
      And eax, 511
      mov [calcx], eax
      mov eax, [calcy]
      Add eax, [temp2]
      mov ebx, 480
      Xor edx, edx
      div eax, ebx
      mov [calcy], edx
   'End Asm

      'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
      mov eax, [calcx]
      Shr eax, 8
      mov ebx, [calcy]
      Shr ebx, 8
      Shl ebx, 1
      Or eax, ebx
      mov [usent], eax
      Shl ebx, 3
      mov eax, [calcy]
      Sub eax, ebx
      mov [calcy], eax

      'this code replaces:
      'calcx = calcx Mod 256
      'calcy = calcy Mod 240
      mov eax, [calcx]
      And eax, 255
      mov [calcx], eax
      mov eax, [calcy]
      mov ebx, 240
      Xor edx, edx
      div eax, ebx
      mov [calcy], edx
      
      'this code replaces: tmpcalcx = calcx Mod 8
      mov eax, [calcx]
      And eax, 7
      mov [tmpcalcx], eax
      
      'this code replaces: ntaddr = ntmap(usent)
      mov ebx, [ntmapptr]
      mov esi, [usent]
      shl esi, 2
      mov eax, [ebx+esi]
      mov [ntaddr], eax

     'this code replaces: ntval = VRAM(ntaddr + ((calcy And 248) Shl 2) + (calcx Shr 3))
      mov eax, [calcy]
      And eax, 248
      Shl eax, 2
      mov ebx, [calcx]
      Shr ebx, 3
      Add eax, ebx
      Add eax, [ntaddr]
      mov [temp1], eax
   End Asm
   ntval = VRAM(temp1)
   
   'this code replaces: patternoffset = PPU.bgtable + (ntval Shl 4) + (calcy Mod 8)
   temp1 = PPU.bgtable
   Asm
      mov eax, [ntval]
      Shl eax, 4
      mov ebx, [calcy]
      And ebx, 7
      add eax, ebx
      Add eax, [temp1]
   'patternoffset = chrbank(patternoffset Shr 10) + (patternoffset Mod 1024)
      mov ebx, 1024
      Xor edx, edx
      div eax, ebx
      mov [temp1], eax
      mov [temp2], edx
   End Asm
   
   'this code replaces: pixel = (CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1
   patternoffset = chrbank(temp1) + temp2
   pixel = CHRMEM(patternoffset)
   Asm
      mov al, 7
      Sub al, [tmpcalcx]
      mov [bitposit], al
      mov eax, [pixel]
      mov cl, [bitposit]
      Shr eax, cl
      And eax, 1
      mov [pixel], eax
      'patternoffset = patternoffset + 8
      mov eax, [patternoffset]
      Add eax, 8
      mov [patternoffset], eax
   End Asm
   
   'this replaces: pixel = pixel + (((CHRMEM(patternoffset) Shr (7 - tmpcalcx)) And 1) Shl 1)
   pixel2 = CHRMEM(patternoffset)
   Asm
      mov eax, [pixel2]
      mov cl, [bitposit]
      Shr eax, cl
      And eax, 1
      Shl eax, 1
      mov ebx, [pixel]
      Or eax, ebx
      mov [pixel], eax

      'this replaces: attribx = calcx Shr 5: attriby = calcy Shr 5
         mov eax, [calcx]
         Shr eax, 5
         mov [attribx], eax
         mov eax, [calcy]
         Shr eax, 5
         mov [attriby], eax

      'this replaces: attribbyte = VRAM(ntaddr + &h3C0 + (attriby Shl 3) + attribx)
         mov esi, [attriby]
         Shl esi, 3
         Add esi, [attribx]
         Add esi, &h3C0
         Add esi, [ntaddr]
         mov ebx, [VRAMptr]
         mov eax, [ebx+esi]
         mov [attribbyte], eax
   End Asm
   
      attribbyte = attribbyte Shr (((calcx And 16) Shr 3) Or ((calcy And 16) Shr 2)) And 3
   'this next part was to replace the above line, but it doesnt work for some reason
   '   Asm
   '      mov ecx, [calcx]
   '      And ecx, 16
   '      Shr ecx, 3
   '      mov edx, [calcy]
   '      And edx, 16
   '      Shr edx, 2
   '      Or ecx, edx
   '      And ecx, 3
   '      mov eax, [attribbyte]
   '      Shr eax, cl
   '      Shl eax, 2
   '      Add eax, [pixel]
   '      Add eax, &h3F00
   '      mov [pixel], eax
      'End Asm
      pixel = pixel + (attribbyte Shl 2)
   backgnd(drx) = PPUread(&h3F00 + pixel)
#EndMacro


this is going to tough to get running fast! if possible, i'd like to see full speeds on 1 GHz processors! i know that's probably not going to be possible in FB... or is it?
D.J.Peters
Posts: 8210
Joined: May 28, 2005 3:28
Contact:

Postby D.J.Peters » May 05, 2011 0:59

the most parts can be precalculated once of course you need an array off (256,240) values
or the most parts only changed if dy=new scanline
than you need only a table with 240 precalculated values.

Joshy
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Postby Mike Chambers » May 05, 2011 1:09

D.J.Peters wrote:the most parts can be precalculated once of course you need an array off (256,240) values
or the most parts only changed if dy=new scanline
than you need only a table with 240 precalculated values.

Joshy


well, the thing is the background is generally constantly changing and scrolling around so there are always new values that the pixels will have.

the NES background rendering basically works like this: (highly simplified, if youre interested in more detail there are good docs on nesdev.parodius.com )

1. there are four "nametables" in memory that are looked at to see what 8x8 image tile is to be drawn for each 8x8 section of the background. horizontal and vertical scroll data is applied before grabbing the pixel, which can change exactly where these small sections begin on the screen.

2. the first two low bits of the corresponding pixel data is pulled from whats called CHR memory.

3. just above each nametable's array in video RAM, is that nametable's "attribute table"... from there is pulled the final third and highest bit of the pixel's palette value.


based on this i was thinking one possibility would be to keep a precalculated cache of all the lower two bits of each pixel value, then just calculate the attribute value (which can be changed whenever to make the same tile blocks have a different palette) and add that to it for every iteration.

the problem there though, is that while a lot of games have the CHR tile data stored in non-changing ROM, other games use CHR-RAM and these tile graphics themselves can be changed at will.

it gets very complicated!
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Postby MichaelW » May 05, 2011 6:04

Can you replace the slow MOD 480 and MOD 240 operations with table lookups?
TJF
Posts: 3609
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Postby TJF » May 05, 2011 6:23

MichaelW wrote:Can you replace the slow MOD 480 and MOD 240 operations with table lookups?


In the ASM code he allready replaced them by the faster AND operations.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Postby MichaelW » May 05, 2011 6:31

TJF wrote:
MichaelW wrote:Can you replace the slow MOD 480 and MOD 240 operations with table lookups?


In the ASM code he allready replaced them by the faster AND operations.

An (n MOD m) can be replaced with (n AND m-1) only if m is a power of 2. For the MOD 480 and MOD 240 operations the ASM code uses DIV.
TJF
Posts: 3609
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Postby TJF » May 05, 2011 7:23

MichaelW wrote:An (n MOD m) can be replaced with (n AND m-1) only if m is a power of 2. For the MOD 480 and MOD 240 operations the ASM code uses DIV.

Yes, you're right. Sorry, my misstake.

Found something

Code: Select all

' instead of
        'Select Case As Const PPU.nametable
                'Case 0: calcx = dx: calcy = dy
                'Case 1: calcx = dx + 256: calcy = dy
                'Case 2: calcx = dx: calcy = dy + 240
                'Case 3: calcx = dx + 256: calcy = dy + 240
        'End Select
        'calcx = (calcx + PPU.hscroll) Mod 512
        'calcy = (calcy + PPU.vscroll) Mod 480
' use
  SELECT CASE AS CONST PPU.nametable
    CASE 0
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup? (MichaelW)
    CASE 1
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup?
    CASE 2
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
    CASE ELSE
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
  END SELECT


' instead of
        'If (calcx < 256) And (calcy < 240) Then
                'usent = 0
        'ElseIf (calcx > 255) And (calcy < 240) Then
                'usent = 1
        'ElseIf (calcx < 256) And (calcy > 239) Then
                'usent = 2
        'ElseIf (calcx > 255) And (calcy > 239) Then
                'usent = 3
        'End If
        'calcx = calcx Mod 256
        'calcy = calcy Mod 240
' use
  usent = -((calcx >= 256) + (calcy >= 240) SHL 1)
  calcx AND= 255
  calcy MOD= 240 ' table lookup here? (MichaelW)
TJF
Posts: 3609
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Postby TJF » May 05, 2011 7:48

Here's my version of the complete macro

Code: Select all

#MACRO drawbackground(dy, dx)
  SELECT CASE AS CONST PPU.nametable
    CASE 0
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup here? (MichaelW)
    CASE 1
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + PPU.vscroll) MOD 480 ' table lookup?
    CASE 2
      calcx = (dx + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
    CASE ELSE
      calcx = (dx + 256 + PPU.hscroll) AND 511
      calcy = (dy + 240 + PPU.vscroll) MOD 480 ' table lookup?
  END SELECT
  ntaddr = ntmap(-((calcx >= 256) + (calcy >= 240) SHL 1))
  calcx AND= 255
  calcy MOD= 240 ' table lookup here? (MichaelW)

  ntval = VRAM(ntaddr + ((calcy AND 248) SHL 2) + (calcx SHR 3))
  tmpcalcx = 7 - (calcx AND 7)

  patternoffset = PPU.bgtable + (ntval SHL 4) + (calcy AND 7)
  patternoffset = chrbank(patternoffset SHR 10) + (patternoffset AND 1023)

  usebank = patternoffset SHR 10
  usebankmod = patternoffset AND 1023
  pixel = (CHRMEM(patternoffset) SHR tmpcalcx) AND 1

  patternoffset += 8
  usebank = patternoffset SHR 10
  usebankmod = patternoffset AND 1023
  pixel += ((CHRMEM(patternoffset) SHR tmpcalcx) AND 1) SHL 1

  attribx = calcx SHR 5 : attriby = calcy SHR 5
  attribbyte = (VRAM(ntaddr + &h3C0 + (attriby SHL 3) + attribx)) SHR _
               (((calcx AND 16) SHR 3) OR ((calcy AND 16) SHR 2)) AND 3

  pixel += attribbyte SHL 2
  backgnd(dx) = VRAM(&H3F00 + pixel)
#ENDMACRO


NOTE: this version has no usent variable anymore. I don't know if it's used outside the macro.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Postby Mike Chambers » May 05, 2011 16:37

thanks, TJF. i will try that when i get home, i am at work right now. at home i have an old 1.13 GHz P3 laptop that is good to try these optimizations on to see how much faster i can get it running. as it stands, i have to run it on there with a frameskip of 9 (!) to get full speed which as you can guess makes it just about unplayable.

if you want to see what kind of speed it's getting now, here is a ZIP with the compiled win32 exec, SDL.dll, as well as a folder containing all of the source.

http://rubbermallet.org/moarnes-0.11.05.05.zip

i've included a couple games that play particularly well in it. megaman 2 i've played all the way to the end twice in it. :)

run moarnes.exe without any parameters to see the command line help.
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Postby Mike Chambers » May 06, 2011 5:17

TJF, your code is very slightly faster than my original pure-FB macro but definitely slower than the assembly version.

also, my original paste was apparently from a slightly old version and the last line: backgnd(dx) = VRAM(&H3F00 + pixel)

actually should have been backgnd(dx) = PPUread(&H3F00 + pixel)

PPUread is a function that performs some checking of the address being read, because certain values can change if they are from certain addresses. (the NES PPU memory map is weird. very weird.)

but calling that function was slowing it WAY down, i moved a couple lines regarding the palette address checking from the PPUread function down to above that backgnd(dx) = blah line do check the value of pixel and then read directly from that VRAM array. it's much faster, but still not amazing.

i guess it's just kind of going to be impossible to hope for too much more speed out of FreeBASIC. i've at least got it running full speed on just about any computer faster than 2.0 to 2.2 GHz or so (depending on exact chip) and that should probably suffice considering modern hardware.

(as long as youre not on a netbook with an atom...)

i think i will see what happens replacing the mod 480 stuff with a lookup table like was suggested, that's a good idea.
TJF
Posts: 3609
Joined: Dec 06, 2009 22:27
Location: N47°, E15°
Contact:

Postby TJF » May 06, 2011 7:02

Mike Chambers wrote:TJF, your code is very slightly faster than my original pure-FB macro but definitely slower than the assembly version.

Thanks for feedback! (Surprising, it works without bugfixing?)

I'm not keen on ASM code, so I can't help any further. But you may optimize by generating an ASM version based on the new FB code?
Mike Chambers
Posts: 85
Joined: Jun 18, 2006 19:48

Postby Mike Chambers » May 06, 2011 7:09

thanks for giving it a shot! :)

and yeah, no bugfixing at all was required.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Postby MichaelW » May 06, 2011 7:16

To get an idea of how much the DIVs are slowing the code down I extracted some representative code and timed it with and without the DIVs.

Code: Select all

'==============================================================================
#include "windows.bi"
#include "counter.bas"
'==============================================================================
'' Counter.bas is available here:
''
'' http://www.freebasic.net/forum/viewtopic.php?t=4221#4221
'==============================================================================

dim as integer calcx,temp1,calcy,temp2,tmpcalcx,ntmapptr,usent,ntaddr

sleep 3000

counter_begin( 1000, HIGH_PRIORITY_CLASS )
counter_end
print counter_cycles;" cycles, empty"

counter_begin( 1000, HIGH_PRIORITY_CLASS )
    Asm
        mov eax, [calcx]
        Add eax, [temp1]
        And eax, 511
        mov [calcx], eax
        mov eax, [calcy]
        Add eax, [temp2]
        mov ebx, 480
        Xor edx, edx
        div eax, ebx
        mov [calcy], edx
        'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
        mov eax, [calcx]
        Shr eax, 8
        mov ebx, [calcy]
        Shr ebx, 8
        Shl ebx, 1
        Or eax, ebx
        mov [usent], eax
        Shl ebx, 3
        mov eax, [calcy]
        Sub eax, ebx
        mov [calcy], eax
          'this code replaces:
          'calcx = calcx Mod 256
          'calcy = calcy Mod 240
        mov eax, [calcx]
        And eax, 255
        mov [calcx], eax
        mov eax, [calcy]
        mov ebx, 240
        Xor edx, edx
        div eax, ebx
        mov [calcy], edx
          'this code replaces: tmpcalcx = calcx Mod 8
        mov eax, [calcx]
        And eax, 7
        mov [tmpcalcx], eax
          'this code replaces: ntaddr = ntmap(usent)
        mov ebx, [ntmapptr]
        mov esi, [usent]
        shl esi, 2
        ''''mov eax, [ebx+esi]
        mov [ntaddr], eax
          'this code replaces: ntval = VRAM(ntaddr +
          ' ((calcy And 248) Shl 2) + (calcx Shr 3))
        mov eax, [calcy]
        And eax, 248
        Shl eax, 2
        mov ebx, [calcx]
        Shr ebx, 3
        Add eax, ebx
        Add eax, [ntaddr]
        mov [temp1], eax
    End Asm
counter_end
print counter_cycles;" cycles, with div"

counter_begin( 1000, HIGH_PRIORITY_CLASS )
    Asm
        mov eax, [calcx]
        Add eax, [temp1]
        And eax, 511
        mov [calcx], eax
        mov eax, [calcy]
        Add eax, [temp2]
        mov ebx, 480
        Xor edx, edx
        ''''div eax, ebx
        mov [calcy], edx
          'this code replaces: usent = (calcx Shr 8) + (calcy Shr 8)*2
        mov eax, [calcx]
        Shr eax, 8
        mov ebx, [calcy]
        Shr ebx, 8
        Shl ebx, 1
        Or eax, ebx
        mov [usent], eax
        Shl ebx, 3
        mov eax, [calcy]
        Sub eax, ebx
        mov [calcy], eax
          'this code replaces:
          'calcx = calcx Mod 256
          'calcy = calcy Mod 240
        mov eax, [calcx]
        And eax, 255
        mov [calcx], eax
        mov eax, [calcy]
        mov ebx, 240
        Xor edx, edx
        ''''div eax, ebx
        mov [calcy], edx
          'this code replaces: tmpcalcx = calcx Mod 8
        mov eax, [calcx]
        And eax, 7
        mov [tmpcalcx], eax
          'this code replaces: ntaddr = ntmap(usent)
        mov ebx, [ntmapptr]
        mov esi, [usent]
        shl esi, 2
        ''''mov eax, [ebx+esi]
        mov [ntaddr], eax
          'this code replaces: ntval = VRAM(ntaddr +
          ' ((calcy And 248) Shl 2) + (calcx Shr 3))
        mov eax, [calcy]
        And eax, 248
        Shl eax, 2
        mov ebx, [calcx]
        Shr ebx, 3
        Add eax, ebx
        Add eax, [ntaddr]
        mov [temp1], eax
    End Asm
counter_end
print counter_cycles;" cycles, w/o div"

sleep


Running on a P3:

Code: Select all

111 cycles, with div
26 cycles, w/o div
AGS
Posts: 1284
Joined: Sep 25, 2007 0:26
Location: the Netherlands

Postby AGS » May 06, 2011 7:42

It works great on my pc (pentium i7@2 ghz). There is quite a bit of noise on the sound output (you hear some 'cracks' and 'pops' every now and then).

I saw a couple of NES emulators that were able to emulate on a 200mhz machine. But those were programmed in delphi or some other language that compiles to highly optimized code.

Perhaps you could use the -gen gcc -O 2 command line option. Sometimes using the C back end can speed things up (sometimes but not always). Or redesign your program?

Anyway, nice program, nice games (king kong!).
DamageX
Posts: 128
Joined: Nov 21, 2009 8:42

Postby DamageX » May 06, 2011 8:27

I used to play NES games in an emulator called NESticle back in the day which ran full speed on my 486. FB won`t be as fast as the fastest assembly but I`m sure you could do better than needing a 1Ghz+ CPU.

You are calculating the name table address and the character pattern address from scratch for every pixel. This is a lot of redundant calculation. Once you`ve calculated the name table address at the beginning of one scanline you can read the remaining bytes for that line sequentially (except when hittting the boundary between name tables during horizontal scrolling). Also, once you`ve calculated the character pattern address you might as well get all eight pixels from it for that line instead of only one.

BTW, instead of using MOD to make calcy wrap at 240 how about an if-then?

label: if calcy>239 then calcy=calcy-240:goto label

since the maximum possible value is only 239+255 right? (GOTO avoiders can use a while loop or something instead)

Return to “General”

Who is online

Users browsing this forum: No registered users and 19 guests