Code: Select all
' Benchmark: bitwise-OR every element of dwinptrS into dwinptrB and report the
' elapsed time of five passes, then print the first ten results.
'
' USE_INLINE_ASM selects which implementation is timed:
'   1 = the hand-written 32-bit x86 inline assembly loop (label "asm: ")
'   0 = the plain BASIC loop, compiled by whichever backend is in use (label "Gcc: ")
#define USE_INLINE_ASM 1

Dim as long i, dwNumLongs=99999999
' An upper bound of dwNumLongs yields indices 0..dwNumLongs; only the first
' dwNumLongs elements are filled and processed below.
Dim as long dwinptrB(dwNumLongs)
Dim as long dwinptrS(dwNumLongs)
Dim t as double

' Fill both arrays with pseudo-random values.
For i = 0 To dwNumLongs - 1
    dwinptrB(i)=Rnd()*12345
    dwinptrS(i)=Rnd()*12345
Next i
print i; " elements"

#if USE_INLINE_ASM
' Base addresses of the two arrays, read by the asm block through [ps] / [pd].
' They do not change between passes, so they are set up once, outside the timed region.
Dim as long ptr ps = @dwinptrS(0)
Dim as long ptr pd = @dwinptrB(0)
#endif

for outerloop As integer=1 to 5
    ' Emit the label before starting the clock so console I/O is not measured.
#if USE_INLINE_ASM
    Print "asm: ";
#else
    Print "Gcc: ";
#endif
    t=Timer()
#if USE_INLINE_ASM
    asm
        ' int 3
        ' ecx counts down from dwNumLongs; the loop ends when the decrement goes negative.
        mov ecx, [dwNumLongs]
        mov esi, [ps]
        mov edi, [pd]
        L0: dec ecx
        js L1
        mov eax, [esi+4*ecx]
        or [edi+4*ecx], eax
        jmp L0
        L1:
    end asm
#else
    ' asm int 3
    For i = 0 To dwNumLongs - 1
        dwinptrB(i) Or = dwinptrS(i)
        ' asm nop
    Next i
#endif
    t=Timer()-t
    print int(t*1000); " ms for or'ing"; dwNumLongs; " elements"
Next

' Show a sample of the result so the work cannot be optimised away unnoticed.
For i = 0 To 9
    print dwinptrB(i)
Next
sleep
Results:
Code: Select all
asm: 133 ms for or'ing 99999999 elements
asm: 148 ms for or'ing 99999999 elements
asm: 136 ms for or'ing 99999999 elements
asm: 125 ms for or'ing 99999999 elements
asm: 124 ms for or'ing 99999999 elements
Gcc: 157 ms for or'ing 99999999 elements
Gcc: 142 ms for or'ing 99999999 elements
Gcc: 152 ms for or'ing 99999999 elements
Gcc: 128 ms for or'ing 99999999 elements
Gcc: 131 ms for or'ing 99999999 elements
Gas: 353 ms for or'ing 99999999 elements
Gas: 349 ms for or'ing 99999999 elements
Gas: 342 ms for or'ing 99999999 elements
Gas: 351 ms for or'ing 99999999 elements
Gas: 333 ms for or'ing 99999999 elements
Gcc32, under the hood:
Code: Select all
; Debugger listing of the compiler-generated OR loop. eax is the running byte
; offset into both arrays; it advances by 4 (one 32-bit element) per pass.
lea esi, [esi] ; align 16
L0:
; NOTE(review): [local.23] is presumably the destination array base, going by the "addr dest" note below.
mov edx, [local.23]
add edx, eax ; addr dest
; NOTE(review): [local.15] is presumably the source array base, going by the "val src" note below.
mov ecx, [local.15]
mov ecx, [eax+ecx] ; val src
or [edx], ecx
add eax, 4
; hex 17D783FC = 399999996 = 99999999 * 4, i.e. the byte offset just past the last element
cmp eax, 17D783FC
jne short L0
20% faster:
Code: Select all
asm
    ' Bitwise-OR dwNumLongs 32-bit elements from [ps] into [pd], walking from the
    ' last element down to index 0 with one conditional branch per element.
    mov ecx, [dwNumLongs]
    mov esi, [ps]
    mov edi, [pd]
    ' ecx = index of the last element
    dec ecx
    ' A count of 0 leaves ecx = -1: skip the loop instead of touching element -1.
    js L1
    L0: mov eax, [esi+4*ecx]
    or [edi+4*ecx], eax
    dec ecx
    ' Keep looping while the index is still >= 0.
    jns L0
    L1:
end asm
In any case, parallel processing with SIMD instructions (SSE or AVX) would be a lot faster. Try this one (a factor of 5 faster than
Gcc with -O3):
Code: Select all
' Bitwise-OR dwNumLongs 32-bit elements of dwinptrS into dwinptrB, 16 bytes
' (four elements) per iteration using SSE2, walking from the end to the start.
' The element count is converted to a byte count first, so the whole array is
' covered; the count must stay below 2^29 elements so the byte count fits in a
' signed 32-bit register.
Dim as integer ps, pd
ps=@dwinptrS(0)
pd=@dwinptrB(0)
asm
    mov ecx, [dwNumLongs]
    mov esi, [ps]
    mov edi, [pd]
    ' ecx = number of bytes still to process (4 bytes per element)
    shl ecx, 2
    ' Tail: handle single elements until the remaining byte count is a multiple of 16.
    sse_tail: test ecx, 15
    jz sse_main
    sub ecx, 4
    mov eax, [esi+ecx]
    or [edi+ecx], eax
    jmp sse_tail
    ' Main loop: ecx becomes the byte offset of the current 16-byte chunk.
    sse_main: sub ecx, 16
    js sse_done
    sse_chunk: movups xmm0, [esi+ecx]
    movups xmm1, [edi+ecx]
    por xmm0, xmm1
    movups [edi+ecx], xmm0
    sub ecx, 16
    ' The chunk at offset 0 is the last one; the next subtraction goes negative.
    jns sse_chunk
    sse_done:
end asm