By sorting the dictionary so words starting with the same letter are grouped together, and only searching within the group needed, it take 16 seconds, almost down to usable speed.
Here's the test program
Code: Select all
Dim As ULong a,i,b,j
Dim As UByte uu,vv,yy,onetime
Dim as string str1
Dim As Double tt,lt,st
st=timer
Dim As ULong dictcount(255) ' count of words starting with each character
dim as uLong dictptr(255) ' pointer into dict of start of each group
Randomize 0 ' get same string for comparison while testing
str1 = space(10000000)
For a = 0 to 10000000-1 step 4
If a=4 Or a=32 Then ' stick in test strings we can find
str1[a]=97:str1[a+1]=98:str1[a+2]=99:str1[a+3]=100
dictcount(97)+=1 ' count
Else
uu=Int(Rnd*94+33) ' get printable characters for testing, normally int( rnd * 256 )
str1[a] = uu
dictcount(uu)+=1 ' count
str1[a+1]=Int(Rnd*94+33)
str1[a+2]=Int(Rnd*94+33)
str1[a+3]=Int(Rnd*94+33)
EndIf
Next
Print "len(str1)=";Len(str1)
Print "str1="
Print Left(str1,80)
Print
a=dictcount(0)
For i=1 To 255
dictptr(i)=a*4 ' get offset into dict for each character
a=a+(dictcount(i))
Next
If a<>10000000/4 then
Print "error! a=";a
sleep
EndIf
Dim as String dict = ""
Dim as string n1
Dim As ZString Ptr sptr,dptr
Dim As ULong dcount,tempstart,tempsize,dcountx,dptrbase
Dim As ULong dcountz(255) ' current count of string for each character in dict
dict=Space(10000000)
sptr=StrPtr(str1)
dptr=StrPtr(dict)
Dim As ULong dictptrold(255)
dptrbase=Culng(dptr) ' convert from zstring ptr to 32bit integer for adding
For i=0 To 255
dictptrold(i)=dictptr(i) ' save old for testing
dictptr(i)=(dictptr(i))+dptrbase ' get actual address
Next
Print "setup time=";Timer-st
tt=Timer:lt=tt
dcount=0
For a = 0 to (len(str1)-1) Step 4
'n1 = mid( str1 , a , 4 )
uu=str1[a] ' get first char of string
tempstart=dictptr(uu) ' the location within dict that strings starting with uu begin
dcountx=dcountz(uu) ' counts so far for strings starting with this character
Asm
mov eax,[sptr] ' address of next 4byte item in str1 wanted
mov eax,[eax] ' pick up 4-byte string wanted
mov edx,[tempstart] ' start at first word of Dict for the first letter found
mov ecx,[dcountx] ' how many of these we've found so far
cmp ecx,0
je savit ' we don't have any for this letter yet, just go save it
lpstrt:
cmp eax,[edx] ' does the string wanted (in eax) match the word in the dictionary
je dfound ' found it, skip out and set up next one
add edx,4 ' next word of dictionary
dec ecx ' count down available words in dict to test
jnz lpstrt ' still some left, go try next one
savit:
mov [edx],eax ' didn't find, save it in next blank spot of dict
inc dword Ptr [dcountx] ' count strings found for this starting letter
inc dword Ptr [dcount] ' count overall strings found
dfound:
End Asm
dcountz(uu)=dcountx
sptr=sptr+4
b=b+1
If b=100000 Then ' take a peek at results every 100,000 items
lt=Timer-lt
If onetime=0 Then ' print out test val first time
Print "dict="
Print Left(dict,80)
j=dictptrold(97) ' index of start of words starting with "a"
Print mid(dict,j+1,80) ' test print
Print "check to see if second abcd is skipped in dict (col 32)"
print
onetime=1
EndIf
Print a;" dict count=";dcount;" t=";lt;" tt=";timer-tt
b=0
lt=Timer
EndIf
Next
print
Print "final count=";dcount;" in ";Timer-tt;" seconds"
Print "(";2500000-dcount;" duplicates discarded)"
sleep
For i=32 To 127
j=dictptrold(i)
a=dictptrold(i+1)-j
b=dcountz(i)
print i;" count=";b;" size=";a;" size/4=";a/4;" ";Mid(dict,j+1,20)
Next
Sleep
sleep
End