Cycle count macros for 32 and 64-bit code

Post your FreeBASIC source, examples, tips and tricks here. Please don’t post code without including an explanation.
Post Reply
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Cycle count macros for 32 and 64-bit code

Post by MichaelW »

Edit: Replaced the initial 64-bit macros with the most recent code that will work with the version 1.02.0 64-bit compiler, and added a new version that will work with the version 1.03.0 64-bit compiler, and hopefully with future versions.

My most recent 32-bit macros (as previously posted, but with some minor changes):

Code: Select all

''=============================================================================
#include "windows.bi"
''=============================================================================
'' These two macros provide a convenient method of measuring the processor
'' clock-cycle count for a block of code. The macros must be called in pairs,
'' and the block of code, or a call to a procedure containing the block of
'' code, must be placed between the counter_begin and counter_end macro calls.
'' The average per-loop cycle count, corrected for the loop overhead, is
'' returned in the global variable counter_cycles.
''
'' I provided access to the process priority class and the thread priority to
'' make it possible to operate at the highest possible priority by using the
'' combination of REALTIME_PRIORITY_CLASS and THREAD_PRIORITY_TIME_CRITICAL.
'' On a multi-core system (or even a P4 with HT) running Windows XP, doing so
'' appears to be reasonably safe, even if the code being timed triggers an
'' exception. But note that running such a high priority on a single-core
'' system can cause the system to hang.
''
'' Note that CPUID will alter the value of EBX.
''=============================================================================

'' Result of a measurement; COUNTER_END stores the 64-bit cycle count here as
'' two dwords and then divides it by the loop count.
dim shared as longint counter_cycles
'' Requested number of passes, and the working counter that the asm loops
'' count down to zero.
dim shared as integer _loop_count_, _loop_counter_
'' Priority settings captured by COUNTER_BEGIN so COUNTER_END can restore them.
dim shared as integer _process_priority_class_, _thread_priority_

'' COUNTER_BEGIN( loop_count, process_priority, thread_priority )
''   loop_count        number of times the timed block will be executed
''   process_priority  value passed to SetPriorityClass for this process
''   thread_priority   value passed to SetThreadPriority for this thread
'' Saves the current priorities, applies the requested ones, times an empty
'' loop of loop_count passes to obtain the loop overhead, then reads the start
'' count and opens the timed loop at local label 1.
'' Four dwords (overhead, then start count) are left on the stack for
'' COUNTER_END to pop, so the timed code must leave ESP balanced.
'' Writes EAX, EBX, ECX and EDX without saving them.
#macro COUNTER_BEGIN( loop_count, process_priority, thread_priority )
    _loop_count_ = loop_count
    _process_priority_class_ = GetPriorityClass(GetCurrentProcess())
    _thread_priority_ = GetThreadPriority(GetCurrentThread())
    SetPriorityClass(GetCurrentProcess(), process_priority)
    SetThreadPriority(GetCurrentThread(), thread_priority)
    _loop_counter_ = _loop_count_
    asm
        '' cpuid acts as a serializing barrier before each rdtsc read.
        xor   eax, eax
        cpuid
        rdtsc
        '' Save the start count (high dword first, low dword on top).
        push  edx
        push  eax
        xor   eax, eax
        cpuid
        .balign 16
        '' Empty reference loop: same loop instructions as the timed loop.
      0:
        sub   DWORD PTR _loop_counter_, 1
        jnz   0b
        xor   eax, eax
        cpuid
        rdtsc
        '' edx:eax = end count - start count = overhead of the empty loop.
        pop   ecx
        sub   eax, ecx
        pop   ecx
        sbb   edx, ecx
        '' Keep the overhead on the stack for COUNTER_END.
        push  edx
        push  eax
        '' Read and save the start count for the timed loop.
        xor   eax, eax
        cpuid
        rdtsc
        push  edx
        push  eax
        '' Reload the working counter for the timed loop.
        mov   eax, _loop_count_
        mov   _loop_counter_, eax
        xor   eax, eax
        cpuid
        .balign 16
        '' Top of the timed loop; COUNTER_END jumps back here.
      1:
    end asm
#endmacro

'' COUNTER_END()
'' Closes the timed loop opened by COUNTER_BEGIN, subtracts the start count
'' and the empty-loop overhead from the end count, stores the 64-bit result in
'' counter_cycles, restores the saved priorities and divides by the loop count
'' to give the per-pass average.
'' Pops the four dwords that COUNTER_BEGIN left on the stack.
#macro COUNTER_END()
    asm
        '' Repeat the timed block until the working counter reaches zero.
        sub   DWORD PTR _loop_counter_, 1
        jnz   1b
        '' Read the end count.
        xor   eax, eax
        cpuid
        rdtsc
        '' edx:eax -= start count of the timed loop.
        pop   ecx
        sub   eax, ecx
        pop   ecx
        sbb   edx, ecx
        '' edx:eax -= overhead of the empty loop.
        pop   ecx
        sub   eax, ecx
        pop   ecx
        sbb   edx, ecx
        '' Store the low and high dwords of the result.
        mov DWORD PTR [counter_cycles], eax
        mov DWORD PTR [counter_cycles+4], edx
    end asm
    SetPriorityClass(GetCurrentProcess(),_process_priority_class_)
    SetThreadPriority(GetCurrentThread(),_thread_priority_)
    counter_cycles /= _loop_count_
#endmacro

''=============================================================================
The new 64-bit macros:

Code: Select all

''=============================================================================
'' This is for FreeBASIC Compiler Version 1.03.0 (07-01-2015), built for win64
''=============================================================================
'' These two macros, which are coded to be -gen gcc / FB 64-bit compatible,
'' provide a convenient method of measuring the processor clock-cycle count
'' for a block of code. The macros must be called in pairs, and the block
'' of code, or a call to a procedure containing the block of code, must be
'' placed between the counter_begin and counter_end macro calls. The average
'' per-loop cycle count, corrected for the loop overhead, is returned in the
'' global variable counter_cycles.
''
'' I provided access to the process priority class and the thread priority to
'' make it possible to operate at the highest possible priority by using the
'' combination of REALTIME_PRIORITY_CLASS and THREAD_PRIORITY_TIME_CRITICAL.
'' On a multi-core system (or even a P4 with HT) running Windows XP, doing so
'' appears to be reasonably safe, even if the code being timed triggers an
'' exception. But note that running such a high priority on a single-core
'' system can cause the system to hang.
''
'' The loops and the cycle-count calculations are done entirely in assembly
'' to avoid problems with compiler optimizations breaking the code.
''
'' Note that CPUID will alter the value of EBX.
''=============================================================================

'' Result of a measurement. COUNTER_END writes it as two dwords at offsets 0
'' and 4, so it relies on integer being 8 bytes in the 64-bit build.
dim shared as integer counter_cycles
'' Requested number of passes and the working counter; the asm accesses both
'' as 32-bit (DWORD) values.
dim shared as long  _loop_count_, _loop_counter_
'' Priority settings captured by COUNTER_BEGIN so COUNTER_END can restore them.
dim shared as DWORD  _process_priority_class_
dim shared as long   _thread_priority_

'' COUNTER_BEGIN( loop_count, process_priority, thread_priority )
''   loop_count        number of times the timed block will be executed
''   process_priority  value passed to SetPriorityClass for this process
''   thread_priority   value passed to SetThreadPriority for this thread
'' Saves the current priorities, applies the requested ones, times an empty
'' loop of loop_count passes to obtain the loop overhead, then reads the start
'' count and opens the timed loop at local label 1.
'' Four qwords (overhead, then start count) are left on the stack for
'' COUNTER_END to pop, so the timed code must leave RSP balanced.
'' Writes RAX, RBX, RCX and RDX without saving them.
#macro COUNTER_BEGIN( loop_count, process_priority, thread_priority )
    _loop_count_ = loop_count
    _process_priority_class_ = GetPriorityClass(GetCurrentProcess())
    _thread_priority_ = GetThreadPriority(GetCurrentThread())
    SetPriorityClass(GetCurrentProcess(), process_priority)
    SetThreadPriority(GetCurrentThread(), thread_priority)
    _loop_counter_ = _loop_count_
    asm
         '' cpuid acts as a serializing barrier before each rdtsc read.
         xor  eax, eax 
         cpuid 
         rdtsc 
         '' Save the start count: rdtsc returns it in edx:eax, and each half
         '' is pushed as a full qword (high half first, low half on top).
         push rdx 
         push rax 
         xor  eax, eax 
         cpuid 
         .balign 16 
         '' Empty reference loop: same loop instructions as the timed loop.
       0: 
         sub  DWORD PTR [_loop_counter_], 1 
         jnz  0b 
         xor  eax, eax 
         cpuid 
         rdtsc 
         '' edx:eax = end count - start count = overhead of the empty loop.
         '' The arithmetic is done on the 32-bit halves, as in the 32-bit code.
         pop  rcx 
         sub  eax, ecx 
         pop  rcx 
         sbb  edx, ecx 
         '' Keep the overhead on the stack for COUNTER_END.
         push rdx 
         push rax 
         '' Read and save the start count for the timed loop.
         xor  eax, eax 
         cpuid 
         rdtsc 
         push rdx 
         push rax 
         '' Reload the working counter for the timed loop.
         mov  eax, _loop_count_ 
         mov  _loop_counter_, eax 
         xor  eax, eax 
         cpuid 
         .balign 16 
         '' Top of the timed loop; COUNTER_END jumps back here.
       1: 
    end asm
#endmacro

'' COUNTER_END
'' Closes the timed loop opened by COUNTER_BEGIN, subtracts the start count
'' and the empty-loop overhead from the end count, stores the result in
'' counter_cycles as two dwords, restores the saved priorities and divides by
'' the loop count to give the per-pass average.
'' Pops the four qwords that COUNTER_BEGIN left on the stack.
#macro COUNTER_END
    asm
         '' Repeat the timed block until the working counter reaches zero.
         sub  DWORD PTR [_loop_counter_], 1 
         jnz  1b 
         '' Read the end count.
         xor  eax, eax 
         cpuid 
         rdtsc 
         '' edx:eax -= start count of the timed loop.
         pop  rcx 
         sub  eax, ecx 
         pop  rcx 
         sbb  edx, ecx 
         '' edx:eax -= overhead of the empty loop.
         pop  rcx 
         sub  eax, ecx 
         pop  rcx 
         sbb  edx, ecx 
         '' Store the low and high dwords of the result.
         mov  DWORD PTR [counter_cycles], eax 
         mov  DWORD PTR [counter_cycles+4], edx 
    end asm
    SetPriorityClass(GetCurrentProcess(),_process_priority_class_)
    SetThreadPriority(GetCurrentThread(),_thread_priority_)
    counter_cycles /= _loop_count_
#endmacro
The most recent version of the older 64-bit macros:

Code: Select all

''=============================================================================
''
'' THIS IS THE FINAL REVISION FOR THE 1.02.0 WIN64 COMPILER.
''
''=============================================================================
'' These two macros, which are coded to be -gen gcc / FB 64-bit compatible,
'' provide a convenient method of measuring the processor clock-cycle count
'' for a block of code. The macros must be called in pairs, and the block
'' of code, or a call to a procedure containing the block of code, must be
'' placed between the counter_begin and counter_end macro calls. The average
'' per-loop cycle count, corrected for the loop overhead, is returned in the
'' global variable counter_cycles.
''
'' I provided access to the process priority class and the thread priority to
'' make it possible to operate at the highest possible priority by using the
'' combination of REALTIME_PRIORITY_CLASS and THREAD_PRIORITY_TIME_CRITICAL.
'' On a multi-core system (or even a P4 with HT) running Windows XP, doing so
'' appears to be reasonably safe, even if the code being timed triggers an
'' exception. But note that running such a high priority on a single-core
'' system can cause the system to hang.
''
'' The loops and the cycle-count calculations are done entirely in assembly
'' to avoid problems with compiler optimizations breaking the code.
''
'' Note that CPUID will alter the value of EBX.
''=============================================================================

'' Result of a measurement. COUNTER_END writes it as two dwords at offsets 0
'' and 4, so it relies on integer being 8 bytes in the 64-bit build.
dim shared as integer counter_cycles
'' Requested number of passes and the working counter; the asm accesses both
'' as 32-bit values.
dim shared as long    _loop_count_, _loop_counter_
'' Priority settings captured by COUNTER_BEGIN so COUNTER_END can restore them.
dim shared as DWORD   _process_priority_class_
dim shared as long    _thread_priority_

'' COUNTER_BEGIN( loop_count, process_priority, thread_priority )
''   loop_count        number of times the timed block will be executed
''   process_priority  value passed to SetPriorityClass for this process
''   thread_priority   value passed to SetThreadPriority for this thread
'' Saves the current priorities, applies the requested ones, times an empty
'' loop of loop_count passes to obtain the loop overhead, then reads the start
'' count and opens the timed loop at local label 1.
'' Four qwords (overhead, then start count) are left on the stack for
'' COUNTER_END to pop, so the timed code must leave RSP balanced.
'' Writes RAX, RBX, RCX and RDX without saving them.
'' The asm is given as quoted strings, and the shared variables are referenced
'' as _LOOP_COUNT_$ / _LOOP_COUNTER_$ (presumably the symbol names emitted by
'' the 1.02.0 compiler - verify if porting to another version).
#macro COUNTER_BEGIN( loop_count, process_priority, thread_priority )
    _loop_count_ = loop_count
    _process_priority_class_ = GetPriorityClass(GetCurrentProcess())
    _thread_priority_ = GetThreadPriority(GetCurrentThread())
    SetPriorityClass(GetCurrentProcess(), process_priority)
    SetThreadPriority(GetCurrentThread(), thread_priority)
    _loop_counter_ = _loop_count_
    asm
        '' Switch the assembler to Intel syntax for this block.
        ".intel_syntax noprefix"
        '' cpuid acts as a serializing barrier before each rdtsc read.
        "xor  eax, eax"
        "cpuid"
        "rdtsc"
        '' Save the start count (high half first, low half on top).
        "push rdx"
        "push rax"
        "xor  eax, eax"
        "cpuid"
        ".balign 16"
        '' Empty reference loop: same loop instructions as the timed loop.
      "0:"
        "sub  DWORD PTR [_LOOP_COUNTER_$], 1"
        "jnz  0b"
        "xor  eax, eax"
        "cpuid"
        "rdtsc"
        '' edx:eax = end count - start count = overhead of the empty loop.
        "pop  rcx"
        "sub  eax, ecx"
        "pop  rcx"
        "sbb  edx, ecx"
        '' Keep the overhead on the stack for COUNTER_END.
        "push rdx"
        "push rax"
        '' Read and save the start count for the timed loop.
        "xor  eax, eax"
        "cpuid"
        "rdtsc"
        "push rdx"
        "push rax"
        '' Reload the working counter for the timed loop.
        "mov  eax, _LOOP_COUNT_$"
        "mov  _LOOP_COUNTER_$, eax"
        "xor  eax, eax"
        "cpuid"
        ".balign 16"
        '' Top of the timed loop; COUNTER_END jumps back here.
      "1:"
        '' Switch back to AT&T syntax for the code that follows.
        ".att_syntax prefix"
    end asm
#endmacro

'' COUNTER_END
'' Closes the timed loop opened by COUNTER_BEGIN, subtracts the start count
'' and the empty-loop overhead from the end count, stores the result in
'' counter_cycles as two dwords, restores the saved priorities and divides by
'' the loop count to give the per-pass average.
'' Pops the four qwords that COUNTER_BEGIN left on the stack.
#macro COUNTER_END
    asm
        '' Switch the assembler to Intel syntax for this block.
        ".intel_syntax noprefix"
        '' Repeat the timed block until the working counter reaches zero.
        "sub  DWORD PTR [_LOOP_COUNTER_$], 1"
        "jnz  1b"
        '' Read the end count.
        "xor  eax, eax"
        "cpuid"
        "rdtsc"
        '' edx:eax -= start count of the timed loop.
        "pop  rcx"
        "sub  eax, ecx"
        "pop  rcx"
        "sbb  edx, ecx"
        '' edx:eax -= overhead of the empty loop.
        "pop  rcx"
        "sub  eax, ecx"
        "pop  rcx"
        "sbb  edx, ecx"
        '' Store the low and high dwords of the result.
        "mov  [COUNTER_CYCLES$], eax"
        "mov  [COUNTER_CYCLES$+4], edx"
        '' Switch back to AT&T syntax for the code that follows.
        ".att_syntax prefix"
    end asm
    SetPriorityClass(GetCurrentProcess(),_process_priority_class_)
    SetThreadPriority(GetCurrentThread(),_thread_priority_)
    counter_cycles /= _loop_count_
#endmacro
Some preliminary tests:

Code: Select all

''=============================================================================
#include "counter32.bas"
''=============================================================================

'' Operands for the arithmetic test, one set per integer type being compared.
dim as integer x=1,y=2,z=3
dim as long    xL=1,yL=2,zL=3
dim as longint xLi=1,yLi=2,zLi=3
'' Source and destination buffers for the rep movsd test.
dim as any ptr p1, p2

'' 100 dwords each; the contents are never initialized, only copied.
p1 = allocate(100*4)
p2 = allocate(100*4)

'' Restrict the process to the processor selected by affinity mask 1.
SetProcessAffinityMask( GetCurrentProcess(), 1)

'' Wait 5 seconds before measuring (presumably to let the system settle).
sleep 5000

'' Run the arithmetic comparison three times so the results can be checked for
'' consistency. Each measurement executes the same expression one million
'' times, once per operand type (integer, long, longint).
for i as integer = 1 to 3
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        z = 0
        z = (x*x+y*y)\(x+1)
    counter_end()
    print counter_cycles;" cycles integer"
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        zL = 0
        zL = (xL*xL+yL*yL)\(xL+1)
    counter_end()
    print counter_cycles;" cycles long"
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        zLi = 0
        zLi = (xLi*xLi+yLi*yLi)\(xLi+1)
    counter_end()
    print counter_cycles;" cycles longint"
    print
next

'' Time a 100-dword block copy from p1 to p2 with rep movsd, three times.
for i as integer = 1 to 3
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        asm
            '' Preserve EDI/ESI; the pushes and pops are balanced, so the
            '' values COUNTER_BEGIN left on the stack are undisturbed.
            push  edi
            push  esi
            '' esi = source pointer, edi = destination pointer, ecx = count.
            mov   esi, [p1]
            mov   edi, [p2]
            mov   ecx, 100
            rep   movsd
            pop   esi
            pop   edi
        end asm
    counter_end()
    print counter_cycles;" cycles, rep movsd * 100"
next

sleep

Code: Select all

''=============================================================================

''---------------------------------------------------------------
'' These declarations are the minimum required for this app, to
'' replace the functionality of the windows.bi that is currently
'' missing from the 64-bit version.
''---------------------------------------------------------------

'' Win32 base types. In the 64-bit compiler integer/uinteger are 64 bits wide,
'' but DWORD, BOOL and the C "int" used by the thread-priority functions are
'' 32 bits on Windows, so they are declared with the fixed-size long/ulong.
type HANDLE as any ptr
type DWORD as ulong
'' Pointer-sized unsigned value, used for the processor affinity mask.
type DWORD_PTR as uinteger
type WINBOOL as long
type BOOL as WINBOOL

#define REALTIME_PRIORITY_CLASS &h00000100
#define THREAD_PRIORITY_TIME_CRITICAL 15

extern "windows" lib "kernel32"
declare function GetCurrentProcess () as HANDLE
declare function GetCurrentThread () as HANDLE
declare function GetPriorityClass (byval as HANDLE) as DWORD
declare function GetThreadPriority (byval as HANDLE) as long
declare function SetPriorityClass (byval as HANDLE, byval as DWORD) as BOOL
declare function SetThreadPriority (byval as HANDLE, byval as long) as BOOL
declare function SetProcessAffinityMask (byval as HANDLE, byval as DWORD_PTR) as BOOL
end extern

''=============================================================================
#include "counter64.bas"
''=============================================================================

'' Operands for the arithmetic test, one set per integer type being compared.
dim as integer x=1,y=2,z=3
dim as long    xL=1,yL=2,zL=3
dim as longint xLi=1,yLi=2,zLi=3
'' Source and destination buffers for the rep movsq test. They are declared
'' shared, presumably so the asm can reference them by symbol name (P1$, P2$).
dim shared as any ptr p1, p2

'' 100 qwords each; the contents are never initialized, only copied.
p1 = allocate(100*8)
p2 = allocate(100*8)

'' Restrict the process to the processor selected by affinity mask 1.
SetProcessAffinityMask( GetCurrentProcess(), 1)

'' Wait 5 seconds before measuring (presumably to let the system settle).
sleep 5000

'' Run the arithmetic comparison three times so the results can be checked for
'' consistency. Each measurement executes the same expression one million
'' times, once per operand type (integer, long, longint).
for i as integer = 1 to 3
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        z = 0
        z = (x*x+y*y)\(x+1)
    counter_end
    print counter_cycles;" cycles integer"

    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        zL = 0
        zL = (xL*xL+yL*yL)\(xL+1)
    counter_end
    print counter_cycles;" cycles long"

    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        zLi = 0
        zLi = (xLi*xLi+yLi*yLi)\(xLi+1)
    counter_end
    print counter_cycles;" cycles longint"
    print
next

'' Time a 100-qword block copy from p1 to p2 with rep movsq, three times.
for i as integer = 1 to 3
    counter_begin(1000000,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
        asm
            ".intel_syntax noprefix"
            '' Preserve RDI/RSI; the pushes and pops are balanced, so the
            '' values COUNTER_BEGIN left on the stack are undisturbed.
            "push rdi"
            "push rsi"
            '' p1 and p2 are 64-bit pointers, so load them into the full
            '' RSI/RDI. Loading ESI/EDI would keep only the low 32 bits and
            '' zero the rest, giving a wrong address for any buffer above 4 GB.
            "mov  rsi, P1$"
            "mov  rdi, P2$"
            '' Writing ECX zero-extends into RCX, the count used by rep.
            "mov  ecx, 100"
            '' test64.asm:937: Error: same type of prefix used twice for:
            '' "rex64 rep movsq"
            "rep   movsq"
            "pop   rsi"
            "pop   rdi"
            ".att_syntax prefix"
        end asm
    counter_end
    print counter_cycles;" cycles, rep movsq * 100"
next

sleep
And the results compiled with 0.90.1 and FreeBASIC-1.00.0-win64, and running on my Core-i3 + Windows 7-64 system:

Code: Select all

 5 cycles integer
 5 cycles long
 112 cycles longint

 5 cycles integer
 5 cycles long
 112 cycles longint

 5 cycles integer
 5 cycles long
 111 cycles longint

 65 cycles, rep movsd * 100
 64 cycles, rep movsd * 100
 64 cycles, rep movsd * 100

Code: Select all

 25 cycles integer
 28 cycles long
 25 cycles longint

 25 cycles integer
 28 cycles long
 25 cycles longint

 25 cycles integer
 28 cycles long
 25 cycles longint

 75 cycles, rep movsq * 100
 77 cycles, rep movsq * 100
 76 cycles, rep movsq * 100
Last edited by MichaelW on Aug 08, 2015 9:43, edited 6 times in total.
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Cycle count macros for 32 and 64-bit code

Post by MrSwiss »

@MichaelW,

what I don't understand is your use of "kernel32" in the 64bit code.
IMHO that should be "kernelbase" ... on 64bit ... ?!?

The confusion is complete when testing a 64bit system (Win8.1) be-
cause of the various DLL's found, in C:\Windows\System32 and also
in C:\Windows\SysWOW64 both having the same DATE and BUILD but
different SIZE and TIMESTAMP ... (ISO-Date: 2014-09-11) ...

regards MrSwiss
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Re: Cycle count macros for 32 and 64-bit code

Post by MichaelW »

MrSwiss wrote:@MichaelW,
what I don't understand is your use of "kernel32" in the 64bit code.
My goal here is to compare the execution speed of 32 and 64-bit code, and to do this I needed access to support procedures, and kernel32 was convenient.

According to this, " Processes can load dynamic-link libraries (DLLs) of the same type. For example, a 64-bit application can load a 64-bit DLL but not a 32-bit DLL…"
srvaldez
Posts: 3373
Joined: Sep 25, 2005 21:54

Re: Cycle count macros for 32 and 64-bit code

Post by srvaldez »

Besides what MichaelW just said, there seem to be enough hard-coded paths to the win32 system that even now, 64-bit DLLs are placed in the system32 folder, and counter-intuitively the 32-bit DLLs are placed in the syswow64 directory.
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Cycle count macros for 32 and 64-bit code

Post by MrSwiss »

Thank you both for the replies, it clears the issues above.
Summary:
"kernel32" is in fact the 64bit Library (DLL) that's used
since it is located in C:\Windows\System32 ... very confusing!
srvaldez wrote: ... enough hard-coded paths to the win32 system that even now, 64-bit dll's are placed in the systems/win32 folder, and counter-intuitively the 32-bit DLL's are placed in the syswow64 directory.
I was misled by the assumption that the 64bit stuff was NEWLY
located in: C:\Windows\System ... as opposed to C:\Windows\System32

Hard-coding seems to be the culprit here, IOW: programmer laziness on the part of M$.
32bit stuff in SysWOW64 ... makes sense to me (Emulator 32bit on 64bit).
The difference C:\Windows\System32 and C:\Windows\System is 100% useless as
there is none!
at least ATM.

High time the guys at MS start cleaning up the mess, they've created !!!
marcov
Posts: 3455
Joined: Jun 16, 2005 9:45
Location: Netherlands
Contact:

Re: Cycle count macros for 32 and 64-bit code

Post by marcov »

MrSwiss wrote:Thank you both for the replies, it clears the issues above.
Summary:
"kernel32" is in fact the 64bit Library (DLL) that's used
since it is located in C:\Windows\System32 ... very confusing!
Just as confusing as the 32-bits being in c:\windows\syswow64? :-)

Seriously, system32 was probably too much hardcoded into too many places to change when they needed 64-bit (which was done for mips and alpha in the nineties). Keep in mind that Microsoft reverse engineered its own Windows sources that had explosively grown between XP and Vista (and partially still ongoing after that).
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Cycle count macros for 32 and 64-bit code

Post by MrSwiss »

@marcov,

something in your timeline (above) has to be wrong, because late in 1999: release of Y2K (aka Windows 2000, NT 5.0).
Release XP: ~ 2002 ... (NT 5.1)
The Alpha / MIPS story was sort of "the other way round", running 32bit Intel Code (*.asm) on different CPU-HW.
This was handled, as far as I'm aware of, by HAL (hardware abstraction layer) and driver sub-systems only.
It therefore implies: "No changes to UI / API code", which was the stated goal of the exercise (MS own intention).

Believe me, I know, because this Y2K migration thing was my Project in the Company I was working for at the time, as
a Systems-Admin, as well as my Diploma-Subject in Informatics (BTW: got Diploma ;-) ... can't have been too bad).
marcov
Posts: 3455
Joined: Jun 16, 2005 9:45
Location: Netherlands
Contact:

Re: Cycle count macros for 32 and 64-bit code

Post by marcov »

MrSwiss wrote:@marcov,

something in your timeline (above) has to be wrong, because late in 1999: release of Y2K (aka Windows 2000, NT 5.0).
Release XP: ~ 2002 ... (NT 5.1)
The Alpha / MIPS story was sort of "the other way round", running 32bit Intel Code (*.asm) on different CPU-HW.
Afaik the 32-bit x86 conversion chips and codemorph stuff was itanium, not Alpha, which was much earlier.

As far as I know the timeline is this:

I don't know MIPS that well, but the Alpha code on w2k was native, not 32-bit Intel, and 64-bit. The point was that this was the old, first "64-bit" Windows. Later, after XP, they ported to x86_64 as XP64, which was a standalone effort. Both XP64 and the later "proper" NT6-based x86_64 solution inherited from the "big cycle break" reverse engineering effort, but probably kept certain assumptions of the older 64-bit Windows ports (MIPS, Alpha, and afaik even a Sparc port that was never released). It is those assumptions (that the 64-bit system is in system32) that I think are still the reason that system32 now contains 64-bit binaries.
This was handled, as far as I'm aware of, by HAL (hardware abstraction layer) and driver sub-systems only.
It therefore implies: "No changes to UI / API code", which was the stated goal of the exercise (MS own intention).
You are confusing hardware abstraction (as in device drivers) with architecture abstraction (as in the machine language the applications are written in). The Alpha port ran 64-bit services, I don't know if it also ran 64-bits apps.
Believe me, I know, because this Y2K migration thing was my Project in the Company I was working for at the time, as
a Systems-Admin, as well as my Diploma-Subject in Informatics (BTW: got Diploma ;-) ... can't have been too bad).
I was on the w2k migration team for a major university as a student assistant to the admin, migrating 14000 workplaces.
MrSwiss
Posts: 3910
Joined: Jun 02, 2013 9:27
Location: Switzerland

Re: Cycle count macros for 32 and 64-bit code

Post by MrSwiss »

marcov wrote:You are confusing hardware abstraction (as in device drivers) with architecture abstraction (as in the machine language the applications are written in). The Alpha port ran 64-bit services, I don't know if it also ran 64-bits apps.
I'm not confusing anything. The Layers are top to bottom:

Code: Select all

Applications (OS and/or Middleware specific)
Middleware (DBMS, Messaging, Sync. etc. if needed, otherwise "basics" handled by OS)
OS (WindowsAPI for above layers)
OS-Drivers (native to OS and compatible to HAL)
HAL (translation from/to the "other HALF" can be referred to as "glue")
CPU-Drivers (native to CPU and compatible to HAL)
Hardware (CPU, Chipset, Bus(es) etc.)
Therefore I doubt the possibility of running App's in 64bit, because WIN-API was 32bit.
This is completely irrespective of the CPU-HW native implementation (below HAL).
marcov
Posts: 3455
Joined: Jun 16, 2005 9:45
Location: Netherlands
Contact:

Re: Cycle count macros for 32 and 64-bit code

Post by marcov »

MrSwiss wrote: Therefore I doubt the possibility of running App's in 64bit, because WIN-API was 32bit.
This is completely irrespective of the CPU-HW native implementation (below HAL).
There was a 64-bit port of w2k for Dec Alpha, and it was a beta, and afaik it was 64-bit. But thinking back I only knew the app could use more than 4GB memory. With current knowledge it might have been a 32-bit app that allocated extra segments in memory via PAE.

It was only run for a short while before reverting back to Redhat, so I don't have very detailed memories of it. IIRC it was said that RH utilized the hardware better (and again, in retrospect that might have been true 64-bit /apps/ vs. PAE use).

Wikipedia says:
. The 64-bit versions of Windows NT were originally intended to run on Itanium and DEC Alpha;the latter was used internally at Microsoft during early development of 64-bit Windows.
This seems to indicate that there were at least development versions of 64-bit on Alpha.
MichaelW
Posts: 3500
Joined: May 16, 2006 22:34
Location: USA

Re: Cycle count macros for 32 and 64-bit code

Post by MichaelW »

This code is an attempt to measure the caching effects when traversing a large array in increasingly large strides. This code uses the newest 64-bit macros (see first post in this thread), and was tested with Version 1.03.0 (07-01-2015), built for win64 (64bit).

Code: Select all

#include "windows.bi"
#include "counter64.bas"

'' Number of elements in each test array.
#define ASIZE 125000000
'' Number of times each traversal is repeated per measurement.
#define LOOPS 20

'' Two equally sized arrays; the measurements alternate between them.
'' With 8-byte integers (64-bit build) each array occupies 1,000,000,000 bytes.
redim shared as integer a1(1 to ASIZE)
redim shared as integer a2(1 to ASIZE)

''-------------------------------------------------
'' Avoid problems with the core TSCs being out of
'' sync by restricting the process to single core.
''-------------------------------------------------

SetProcessAffinityMask( GetCurrentProcess(), 1)

'' Wait 5 seconds before measuring (presumably to let the system settle).
sleep 5000

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE
        a1(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 1"    

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE
        a2(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 1"


counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 2
        a1(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 2"    

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 2
        a2(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 2"


counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 4
        a1(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 4"    

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 4
        a2(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 4"


counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 8
        a1(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 8"    

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 8
        a2(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 8"


counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 16
        a1(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 16"    

counter_begin(LOOPS,REALTIME_PRIORITY_CLASS,THREAD_PRIORITY_TIME_CRITICAL)
    for i as integer = 1 to ASIZE step 16
        a2(i)=i
    next
counter_end
print counter_cycles;chr(9);"cycles, step 16"    
sleep

Code: Select all

 2034320895     cycles, step 1
 2086152558     cycles, step 1
 1103883824     cycles, step 2
 1112968136     cycles, step 2
 915618279      cycles, step 4
 922915336      cycles, step 4
 899494764      cycles, step 8
 914314232      cycles, step 8
 575424043      cycles, step 16
 563708998      cycles, step 16
I use two arrays and alternate between them as a simple way to (hopefully) flush the caches.

As the stride length increases, the number of accesses decreases in proportion, so at a stride of 16 the number of accesses is 1/16 of the number at a stride of 1, but as you can see the number of cycles to traverse the array is > 1/4 of that required at a stride of 1. The step 4 and step 8 cycle counts were consistently closer together than I expected, perhaps due to some sort of interaction between the size of a cache line and the size of the stride.
Post Reply