Execution speed of nested loops

New to FreeBASIC? Post your questions here.
fzabkar
Posts: 154
Joined: Sep 29, 2018 2:52
Location: Australia

Re: Execution speed of nested loops

Post by fzabkar »

This code executes in 38s as opposed to 59s:

Code: Select all

    Dim dwSum31 As ULong
    Dim dwSum30 As ULong
    Dim dwSum29 As ULong
    Dim dwSum28 As ULong
    Dim dwSum27 As ULong
    Dim dwSum26 As ULong
    Dim dwSum25 As ULong
    Dim dwSum24 As ULong
    Dim dwSum23 As ULong
    Dim dwSum22 As ULong
    Dim dwSum21 As ULong
    Dim dwSum20 As ULong
    Dim dwSum19 As ULong
    Dim dwSum18 As ULong
    Dim dwSum17 As ULong
    Dim dwSum16 As ULong
    Dim dwSum15 As ULong
    Dim dwSum14 As ULong
    Dim dwSum13 As ULong
    Dim dwSum12 As ULong
    Dim dwSum11 As ULong
    Dim dwSum10 As ULong
    Dim dwSum9 As ULong
    Dim dwSum8 As ULong
    Dim dwSum7 As ULong
    Dim dwSum6 As ULong
    Dim dwSum5 As ULong
    Dim dwSum4 As ULong
    Dim dwSum3 As ULong
    Dim dwSum2 As ULong
    Dim dwSum1 As ULong
    Dim dwSum0 As ULong

    ASM
        mov ecx, [dwNumLongs]
        mov esi, [dwInptr]
L31:    lodsd                    ' load EAX with contents of address pointed to by ESI - ESI increments by 4
        shl eax, 1               ' shift EAX left 1 bit
        jnc L30
        inc dword Ptr [dwSum31]
L30:    shl eax, 1
        jnc L29
        inc dword Ptr [dwSum30]
L29:    shl eax, 1
        jnc L28
        inc dword Ptr [dwSum29]
L28:    shl eax, 1
        jnc L27
        inc dword Ptr [dwSum28]
L27:    shl eax, 1
        jnc L26
        inc dword Ptr [dwSum27]
L26:    shl eax, 1
        jnc L25
        inc dword Ptr [dwSum26]
L25:    shl eax, 1
        jnc L24
        inc dword Ptr [dwSum25]
L24:    shl eax, 1
        jnc L23
        inc dword Ptr [dwSum24]
L23:    shl eax, 1
        jnc L22
        inc dword Ptr [dwSum23]
L22:    shl eax, 1
        jnc L21
        inc dword Ptr [dwSum22]
L21:    shl eax, 1
        jnc L20
        inc dword Ptr [dwSum21]
L20:    shl eax, 1
        jnc L19
        inc dword Ptr [dwSum20]
L19:    shl eax, 1
        jnc L18
        inc dword Ptr [dwSum19]
L18:    shl eax, 1
        jnc L17
        inc dword Ptr [dwSum18]
L17:    shl eax, 1
        jnc L16
        inc dword Ptr [dwSum17]
L16:    shl eax, 1
        jnc L15
        inc dword Ptr [dwSum16]
L15:    shl eax, 1
        jnc L14
        inc dword Ptr [dwSum15]
L14:    shl eax, 1
        jnc L13
        inc dword Ptr [dwSum14]
L13:    shl eax, 1
        jnc L12
        inc dword Ptr [dwSum13]
L12:    shl eax, 1
        jnc L11
        inc dword Ptr [dwSum12]
L11:    shl eax, 1
        jnc L10
        inc dword Ptr [dwSum11]
L10:    shl eax, 1
        jnc L09
        inc dword Ptr [dwSum10]
L09:    shl eax, 1
        jnc L08
        inc dword Ptr [dwSum9]
L08:    shl eax, 1
        jnc L07
        inc dword Ptr [dwSum8]
L07:    shl eax, 1
        jnc L06
        inc dword Ptr [dwSum7]
L06:    shl eax, 1
        jnc L05
        inc dword Ptr [dwSum6]
L05:    shl eax, 1
        jnc L04
        inc dword Ptr [dwSum5]
L04:    shl eax, 1
        jnc L03
        inc dword Ptr [dwSum4]
L03:    shl eax, 1
        jnc L02
        inc dword Ptr [dwSum3]
L02:    shl eax, 1
        jnc L01
        inc dword Ptr [dwSum2]
L01:    shl eax, 1
        jnc L00
        inc dword Ptr [dwSum1]
L00:    shl eax, 1
        jnc Loop
        inc dword Ptr [dwSum0]
Loop:   dec ecx                 ' decrement ECX
        jg L31                  ' jump to L31 if greater than 0
   
    End ASM
    
    uliSumOne32( 0 ) += dwSum0
    uliSumOne32( 1 ) += dwSum1
    uliSumOne32( 2 ) += dwSum2
    uliSumOne32( 3 ) += dwSum3
    uliSumOne32( 4 ) += dwSum4
    uliSumOne32( 5 ) += dwSum5
    uliSumOne32( 6 ) += dwSum6
    uliSumOne32( 7 ) += dwSum7
    uliSumOne32( 8 ) += dwSum8
    uliSumOne32( 9 ) += dwSum9
    uliSumOne32( 10 ) += dwSum10
    uliSumOne32( 11 ) += dwSum11
    uliSumOne32( 12 ) += dwSum12
    uliSumOne32( 13 ) += dwSum13
    uliSumOne32( 14 ) += dwSum14
    uliSumOne32( 15 ) += dwSum15
    uliSumOne32( 16 ) += dwSum16
    uliSumOne32( 17 ) += dwSum17
    uliSumOne32( 18 ) += dwSum18
    uliSumOne32( 19 ) += dwSum19
    uliSumOne32( 20 ) += dwSum20
    uliSumOne32( 21 ) += dwSum21
    uliSumOne32( 22 ) += dwSum22
    uliSumOne32( 23 ) += dwSum23
    uliSumOne32( 24 ) += dwSum24
    uliSumOne32( 25 ) += dwSum25
    uliSumOne32( 26 ) += dwSum26
    uliSumOne32( 27 ) += dwSum27
    uliSumOne32( 28 ) += dwSum28
    uliSumOne32( 29 ) += dwSum29
    uliSumOne32( 30 ) += dwSum30
    uliSumOne32( 31 ) += dwSum31
caseih
Posts: 2157
Joined: Feb 26, 2007 5:32

Re: Execution speed of nested loops

Post by caseih »

fzabkar wrote:It seems that ASM doesn't like arrays.
That is of course nonsense. Memory is an array. An array is just a region of memory you can walk through. But I don't know enough assembly to suggest how you would do that.

But in any case, looks like you've got a solution that works well. I don't think you can get much more speed out of it than you now have.
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Execution speed of nested loops

Post by MrSwiss »

Here is the FB way of achieving the same result (sans single-line macro).
Maybe a little slower than ASM but without modification usable in 32/64 bit compiler.
(assembly to stay efficient, whould have to be recoded for 64-bits)
Uses a Union, to determine the bit-state:

Code: Select all

' bin32_u-test1.bas -- (c) 2021-04-03, MrSwiss
'

Union bin32_u                           ' NOTE: little endian
    As ULong    valu                    ' value (to investigate)
    Type
        b_0 : 1 As UByte                ' get bit's state (set = 1 | unset = 0)
        b_1 : 1 As UByte
        b_2 : 1 As UByte
        b_3 : 1 As UByte
        b_4 : 1 As UByte
        b_5 : 1 As UByte
        b_6 : 1 As UByte
        b_7 : 1 As UByte
        b_8 : 1 As UByte
        b_9 : 1 As UByte
        b10 : 1 As UByte
        b11 : 1 As UByte
        b12 : 1 As UByte
        b13 : 1 As UByte
        b14 : 1 As UByte
        b15 : 1 As UByte
        b16 : 1 As UByte
        b17 : 1 As UByte
        b18 : 1 As UByte
        b19 : 1 As UByte
        b20 : 1 As UByte
        b21 : 1 As UByte
        b22 : 1 As UByte
        b23 : 1 As UByte
        b24 : 1 As UByte
        b25 : 1 As UByte
        b26 : 1 As UByte
        b27 : 1 As UByte
        b28 : 1 As UByte
        b29 : 1 As UByte
        b30 : 1 As UByte
        b31 : 1 As UByte
    End Type
End Union


' ===== Test-Union =====
Declare Function bin32(ByVal v As ULong, ByVal b_idx As UByte) As UByte

Dim As ULong arr(0) = { &hFEDCBA98 }    ' one single element, init

For i As UInteger = 0 To 31             ' only inner loop
    If i < 10 Then
        Print "bit  #" & i & " = " & bin32(arr(0), i)
    Else
        Print "bit #" & i & " = " & bin32(arr(0), i)
    End If
Next

Print : Print "... done ... ";

Sleep 
' ===== End Test-Union =====

Function bin32( _                       ' bit check on ULong
    ByVal v     As ULong, _             ' value input
    ByVal b_idx As UByte  _             ' bit index to check (0 to 31)
    ) As UByte                          ' set = 1 | unset = 0
    Dim As bin32_u  u                   ' one instance of union
    Dim As UByte    res                 ' return variable
    u.valu = v                          ' load passed value (ULong)
    Select Case As Const b_idx          ' resolve bit-state
        Case  0 : res = u.b_0
        Case  1 : res = u.b_1
        Case  2 : res = u.b_2
        Case  3 : res = u.b_3
        Case  4 : res = u.b_4
        Case  5 : res = u.b_5
        Case  6 : res = u.b_6
        Case  7 : res = u.b_7
        Case  8 : res = u.b_8
        Case  9 : res = u.b_9
        Case 10 : res = u.b10
        Case 11 : res = u.b11
        Case 12 : res = u.b12
        Case 13 : res = u.b13
        Case 14 : res = u.b14
        Case 15 : res = u.b15
        Case 16 : res = u.b16
        Case 17 : res = u.b17
        Case 18 : res = u.b18
        Case 19 : res = u.b19
        Case 20 : res = u.b20
        Case 21 : res = u.b21
        Case 22 : res = u.b22
        Case 23 : res = u.b23
        Case 24 : res = u.b24
        Case 25 : res = u.b25
        Case 26 : res = u.b26
        Case 27 : res = u.b27
        Case 28 : res = u.b28
        Case 29 : res = u.b29
        Case 30 : res = u.b30
        Case 31 : res = u.b31
        'Case Else : res = 255           ' ERROR checking (larger than 31)
    End Select
    Return res
End Function
' ----- EOF -----
With GCC's optimisations the Function is likely to be 'inlined'.
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: Execution speed of nested loops

Post by jj2007 »

fzabkar wrote:This code executes in 38s as opposed to 59s:

Code: Select all

    Dim dwSum31 As ULong
    Dim dwSum30 As ULong
    Dim dwSum29 As ULong...    uliSumOne32( 30 ) += dwSum30
    uliSumOne32( 31 ) += dwSum31
If you provide a complete example that compiles without errors, I would be tempted to play with it.

Code: Select all

\AllBasics\FreeBasic\tmp\TmpFb.bas(139) error 42: Variable not declared, uliSumOne32
\AllBasics\FreeBasic\tmp\TmpFb.bas(140) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(141) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(142) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(143) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(144) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(145) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(146) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(147) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(148) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(148) error 133: Too many errors, exiting
\AllBasics\FreeBasic\tmp\TmpFb.bas(139) error 42: Variable not declared, uliSumOne32
\AllBasics\FreeBasic\tmp\TmpFb.bas(140) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(141) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(142) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(143) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(144) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(145) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(146) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(147) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(148) error 3: Expected End-of-Line, found 'uliSumOne32'
\AllBasics\FreeBasic\tmp\TmpFb.bas(148) error 133: Too many errors, exiting
fzabkar
Posts: 154
Joined: Sep 29, 2018 2:52
Location: Australia

Re: Execution speed of nested loops

Post by fzabkar »

This is working code which processes the bit frequency in a file:

http://www.users.on.net/~fzabkar/FreeBa ... ount32.bas
http://www.users.on.net/~fzabkar/FreeBa ... ount32.exe

That's what I'm using for my testing (on a 1.2GB file). Note that I have commented out 3 alternative methods.

I'll also test MrSwiss's code as soon as I can.

Edit: MrSwiss's code requires 194 secs.
fzabkar
Posts: 154
Joined: Sep 29, 2018 2:52
Location: Australia

Re: Execution speed of nested loops

Post by fzabkar »

@jj2007, here is my test code. I hope it's what you wanted.

Code: Select all

#include "vbcompat.bi"

dim as ushort i, j, k
dim b as byte
dim uliBit( 0 To 31 ) as ulongint
dim bufptr as any ptr
dim dwbufptr as ulong ptr
dim t as double
dim as ushort imax, jmax, kmax
dim buffsiz as ushort

imax = 99
jmax = 999
buffsiz = 1000
kmax = buffsiz \ 4 - 1

bufptr = allocate( buffsiz )
dwbufptr = bufptr

' randomise timer

for n as integer = 0 to kmax
   dwbufptr[n] = int(rnd * &hFFFFFFFFUL)
next

t = Timer()

for i = 0 to imax
 for j = 0 to jmax
  for k = 0 to kmax
   for b = 0 to 31
    uliBit(b) -= Bit(dwbufptr[k], b)
   next b
  next k
 next j
next i

for b = 0 to 31
 print "frequency of bit #"; b; " = "; uliBit(b) / 250000; " %"
next b

t = Timer() - t

Print
Print "Processing time = "; Format( t, "0.0" ); " seconds"

sleep
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: Execution speed of nested loops

Post by jj2007 »

fzabkar wrote:@jj2007, here is my test code. I hope it's what you wanted.
Yes, thanks, perfect. I invested some efforts in converting the innermost loop to Assembly, but the result is a meagre 20% speed gain with GAS - and it's slower than what Gcc32 produces! Test it yourself:

Code: Select all

#include "vbcompat.bi"

dim as ushort i, j, k
dim b as byte
dim uliBit( 0 To 31 ) as ulongint
print "Size=";sizeof(ulibit)
dim bufptr as any ptr
dim dwbufptr as ulong ptr
dim t as double
dim as ushort imax, jmax, kmax
dim buffsiz as ushort

imax = 99
jmax = 999
buffsiz = 200	' 1000
kmax = buffsiz \ 4 - 1

bufptr = allocate( buffsiz )
dwbufptr = bufptr

' randomise timer

for n as integer = 0 to kmax
   dwbufptr[n] = int(rnd * &hFFFFFFFFUL)
next
t = Timer()
for i = 0 to imax
 for j = 0 to jmax
  for k = 0 to kmax
	asm
	push edi
	mov ecx, 32
	lea edi, [uliBit]	' destination array
	mov eax, [k]	' current k
	mov edx, [dwbufptr]
	mov eax, [edx+4*eax]
loopStart:
	dec ecx
	js loopEnd
	shl eax, 1
	jnc loopStart
	inc dword ptr [edi+8*ecx]
	jmp loopStart
loopEnd:
	pop edi
	end asm
  next k
 next j
next i
for b = 0 to 31
 if b<=2 or b>=30 then print "frequency of bit #"; b; " = "; uliBit(b) / 250000; " %"
next b
t = Timer() - t
Print "Processing time = "; Format( t, "0.0" ); " seconds"
Print

for b = 0 to 31
	uliBit(b)=0
next
t = Timer()
for i = 0 to imax
 for j = 0 to jmax
  for k = 0 to kmax
   for b = 0 to 31
	uliBit(b) -= Bit(dwbufptr[k], b)
   next b
  next k
 next j
next i
for b = 0 to 31
 if b<=2 or b>=30 then print "frequency of bit #"; b; " = "; uliBit(b) / 250000; " %"
next b
t = Timer() - t
Print "Processing time = "; Format( t, "0.0" ); " seconds"
Print

sleep
fzabkar
Posts: 154
Joined: Sep 29, 2018 2:52
Location: Australia

Re: Execution speed of nested loops

Post by fzabkar »

I get 3.7s versus 5.8s.

I'm wondering if I could use the XMM0-15 and YMM0-15 registers for the INCs within my ASM loop, and "MOV dword Ptr [dwSumx], XMMx" outside the loop. (I realise that the op codes may be incorrect -- should I use movups instead of mov?) Would I need to push and pop the registers?

Code: Select all

        xor xmm0, xmm0
        ...
        xor xmm15, xmm15
        xor ymm0, ymm0
        ...
        xor ymm15, ymm15

        mov ecx, [dwNumLongs]
        mov esi, [dwInptr]
L31:    lodsd 
        shl eax, 1
        jnc L30
        inc xmm15
        ...
L16:    shl eax, 1
        jnc L15
        inc xmm0

L15:    lodsd 
        shl eax, 1
        jnc L14
        inc ymm15
        ...
L00:    shl eax, 1
        jnc Loop
        inc ymm0

Loop:   dec ecx
        jg L31

        mov dword Ptr [dwSum31], xmm15
        ...
        mov dword Ptr [dwSum16], xmm0
        
        mov dword Ptr [dwSum15], ymm15
        ...
        mov dword Ptr [dwSum0], ymm0
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Execution speed of nested loops

Post by MrSwiss »

Lets face it that, before thinking about code improvements for speed are made:
- using a testcode that produces *reliable results (*correct time taken).

The current testcode produces nonsense because it doesn't really compare properly.
  • includes unwanted 'loop timing' (time taken by i, j and k loops)
  • we only want compared: one implementation (ASM) vs. the other (macro) and nothing more
Therefore use below recoded testcode:

Code: Select all

Dim As ULongInt uliBit(0 To 31)         ' 64 bit
Dim As Double   t, tt                   ' 64 bit
Dim As UInteger i, j, k, b              ' all for-loop iterators 32/64 bit
Dim As ULong Ptr    dwbufptr            ' 32/64 Bit
Dim As UShort   imax, jmax, kmax, buffsiz   ' 16 bit

imax = 99
jmax = 999
buffsiz = 200   ' 1000
kmax = buffsiz \ 4 - 1

dwbufptr = Allocate(buffsiz)

randomize Timer                         ' seed PRNG
For i = 0 To 999                        ' give it a 'spin up' (8000 calls)
    Rnd : Rnd : Rnd : Rnd : _
    Rnd : Rnd : Rnd : Rnd
Next

for n as integer = 0 to kmax            ' initialize buffer
    dwbufptr[n] = CULng(rnd * &h0100000000ull)  ' 2^32 and NOT 2^32 - 1
Next

Print "ArraySize ="; SizeOf(ULongInt) * (UBound(uliBit) + 1) ' calc. array size
Print

' -----------------------------------------------
' First test with assembly (jj2007)
' -----------------------------------------------

For b = 0 To 31                         ' clear result buffer
    uliBit(b) = 0
Next

tt = 0.0                                ' reset total time

For i = 0 To imax
    For j = 0 To jmax
        For k = 0 To kmax
            t = Timer()
            Asm
                push edi
                mov ecx, 32
                lea edi, [uliBit]   ' destination array
                mov eax, [k]   ' current k
                mov edx, [dwbufptr]
                mov eax, [edx+4*eax]
            loopStart:
                dec ecx
                js loopEnd
                Shl eax, 1
                jnc loopStart
                inc dword ptr [edi+8*ecx]
                jmp loopStart
            loopEnd:
                pop edi
            End Asm
            tt += Timer() - t
        Next
    Next
Next

for b = 0 To 31
    If b < 3 Then 
        print "frequency of bit  #"; b; " = "; uliBit(b) / 250000; " %"
    ElseIf b > 28 Then
        print "frequency of bit #"; b; " = "; uliBit(b) / 250000; " %"
    End If
Next
Print "Processing time = "; Fix(tt * 1000); " mS"
Print

' -----------------------------------------------
' Second test with single-line macro 'built-in'
' -----------------------------------------------

For b = 0 To 31                         ' clear result buffer
    uliBit(b) = 0
Next

tt = 0.0                                ' reset total time

For i = 0 To imax
    For j = 0 To jmax
        For k = 0 To kmax
            t = Timer()
            For b = 0 to 31
                uliBit(b) -= Bit(dwbufptr[k], b)
            Next b
            tt += Timer() - t
        Next
    Next
Next

for b = 0 to 31
    If b < 3 Then 
        print "frequency of bit  #"; b; " = "; uliBit(b) / 250000; " %"
    ElseIf b > 28 Then
        print "frequency of bit #"; b; " = "; uliBit(b) / 250000; " %"
    End If
Next
Print "Processing time = "; Fix(tt * 1000); " mS"
Print

DeAllocate(dwbufptr) : dwbufptr = 0

Sleep
The result time is now in mS (instead of sec.).
The results differ notably depending on: -gen gas or -gen gcc (with -O 2).
Tests made with FBC 32-bits ver. 1.07.2 (WIN).
jj2007
Posts: 2326
Joined: Oct 23, 2016 15:28
Location: Roma, Italia
Contact:

Re: Execution speed of nested loops

Post by jj2007 »

fzabkar wrote:I get 3.7s versus 5.8s.

I'm wondering if I could use the XMM0-15 and YMM0-15 registers for the INCs within my ASM loop
That would make sense for the innermost loop, if there was a SIMD instruction that could parallelise the bit testing. I am not sure, but I doubt there is one. I found this article dealing with a very similar issue, but I haven't been able to find the instruction they use. Have a look at ktestb, too.
Post Reply