I ported it to FreeBasic, because it is very simple and I think it is also instructive.
This kind of compression works using a fixed dictionary (that has been created by the original author using a script that analyzed frequencies of each sequence in most common English texts), and looks if any of those sequences is present inside the string to compress. If the sequence is present, it is replaced by a single byte that states the position of that sequence in the dictionary (for that reason the dictionary cannot have more than 256 items). Otherwise, characters 254 or 255 are used to tell that the next part must not be compressed.
It is not as effective as the DEFLATE algorithm used by zlib, but it's able to work even on very short strings, since it needs no headers or tabled in the compressed string.
The compressor is this:
Code: Select all
function smaz_compress(in as string) as string
dim as const string Smaz_cb(241)={!"\2s,\182", !"\3had\154\2leW", !"\3on \142", "", !"\1yS",_
!"\2ma\173\2li\151", !"\3or \176", "", !"\2ll\152\3s t\191",_
!"\4fromg\2mel", "", !"\3its\218", !"\1z\219", !"\3ingF", !"\1>\222",_
!"\1 "+chr(0)+!"\3 (\2nc\228", !"\2nd=\3 on\202",_
!"\2ne\139\3hat\190\3re q", "", !"\2ngT\3herz\4have\198\3s o\149",_
"", !"\3ionk\3s a\172\2ly\234", !"\3hisL\3 inN\3 be\170", "",_
!"\3 fo\213\3 of \3 ha\201", "", !"\2of\5",_
!"\3 co\161\2no\183\3 ma\248", "", "", !"\3 cl\238\3enta\3 an7",_
!"\2ns\192\1\"e", !"\3n t\143\2ntP\3s, \133",_
!"\2pe\208\3 we\233\2om\147", !"\2on\31", "", !"\2y G", !"\3 wa\185",_
!"\3 re\209\2or*", "", !"\2=\"\251\2ot\223", !"\3forD\2ou[",_
!"\3 toR", !"\3 th\r", !"\3 it\246",_
!"\3but\177\2ra\130\3 wi\243\2</\241", !"\3 wh\159", !"\2 4",_
!"\3nd ?", !"\2re!", "", !"\3ng c", "",_
!"\3ly \199\3ass\211\1a\4\2rir", "", "", "", !"\2se_", !"\3of \"",_
!"\3div\244\2ros\3ere\160", "", !"\2ta\200\1bZ\2si\212", "",_
!"\3and\a\002rs\221", !"\2rt\242", !"\2teE", !"\3ati\206", !"\2so\179",_
!"\2th\17", !"\2tiJ\1c\28\3allp", !"\3ate\229", !"\2ss\166",_
!"\2stM", "", !"\2><\230", !"\2to\20", !"\3arew", !"\1d\24",_
!"\2tr\195", "", !"\1\n1\003 a \146", !"\3f tv\2veo", !"\2un\224", "",_
!"\3e o\162", !"\2a \163\2wa\214\1e\2", !"\2ur\150\3e a\188",_
!"\2us\164\3\n\r\n\247", !"\2ut\196\3e c\251", !"\2we\145", "", "",_
!"\2wh\194", !"\1f,", "", "", "", !"\3d t\134", "", "", !"\3th \227",_
!"\1g;", "", "", !"\1\r9\003e s\181", !"\3e t\156", "", !"\3to Y",_
!"\3e\r\n\158", !"\2d \30\1h\18", "", !"\1,Q", !"\2 a\25", !"\2 b^",_
!"\2\r\n\21\2 cI", !"\2 d\165", !"\2 e\171", !"\2 fh\1i\b\002e \v",_
"", !"\2 hU\1-\204", !"\2 i8", "", "", !"\2 l\205", !"\2 m{",_
!"\2f :\2 n\236", !"\2 o\29", !"\2 p}\1.n\3\r\n\r\250", "",_
!"\2 r\189", !"\2 s>", !"\2 t\14", "", !"\2g \157\5which+\3whi\247",_
!"\2 w5", !"\1/\197", !"\3as \140", !"\3at \135", "", !"\3who\217", "",_
!"\1l\22\2h \138", "", !"\2, $", "", !"\4withV", "", "", "", !"\1m-", "",_
"", !"\2ac\239", !"\2ad\232", !"\3TheH", "", "", !"\4this\155\1n\t",_
"", !"\2. y", "", !"\2alX\3e, \245", !"\3tio\141\2be\\",_
!"\2an\26\3ver\231", "", !"\4that0\3tha\203\1o\6", !"\3was2",_
!"\2arO", !"\2as.", !"\2at'\3the\1\4they\128\5there\210\5theird",_
!"\2ce\136", !"\4were]", "", !"\2ch\153\2l \180\1p<", "", "",_
!"\3one\174", "", !"\3he \19\2dej", !"\3ter\184", !"\2cou", "",_
!"\2by\127\2di\129\2eax", "", !"\2ec\215", !"\2edB", !"\2ee\235", "",_
"", !"\1r\f\002n )", "", "", "", !"\2el\178", "", !"\3in i\2en3", "",_
!"\2o `\1s\n", "", !"\2er\27", !"\3is t\2es6", "", !"\2ge\249",_
!"\4.com\253", !"\2fo\220\3our\216", !"\3ch \193\1t\3", !"\2hab", "",_
!"\3men\252", "", !"\2he\16", "", "", !"\1u&", !"\2hif", "",_
!"\3not\132\2ic\131", !"\3ed @\2id\237", "", "", !"\2ho\187",_
!"\2r K\1vm", "", "", "", !"\3t t\175\2il\240", !"\2im\226",_
!"\3en \207\2in\15", !"\2io\144", !"\2s \23\1wA", "", !"\3er |",_
!"\3es ~\2is%", !"\2it/", "", !"\2iv\186", "",_
!"\2t #\ahttp://C\1x\250", !"\2la\137", !"\1<\225", !"\3, a\148"}
dim as unsigned integer h1, h2, h3
dim verb as string
dim o as string
dim p as integer=1
do until p>len(in)
dim j as integer
dim slot as string
j=7
h1=asc(mid(in,p,1)) shl 3
h2=h1+asc(mid(in,p+1,1))
h3=h2 xor asc(mid(in,p+2,1))
' Try to lookup substrings into the hash table, starting from the
' longer to the shorter substrings
do while j>0
select case j
case 1: slot = Smaz_cb(h1 mod 241)
case 2: slot = Smaz_cb(h2 mod 241)
case else: slot = Smaz_cb(h3 mod 241)
end select
do while slot<>""
if asc(mid(slot,1,1))=j andalso mid(slot,2,j)=mid(in,p,j) then
' Match found in the hash table,
' prepare a verbatim bytes flush if needed
if verb<>"" then
if len(verb)=1 then
o=o+chr(254)
else
o=o+chr(255)+chr(len(verb)-1)
end if
o=o+verb
verb=""
end if
' Emit the byte
o=o+mid(slot,asc(left(slot,1))+2,1)
p+=j
goto nextbyte
else
slot =mid(slot,3+asc(left(slot,1)))
end if
loop
j-=1
loop
' Match not found - add the byte to the verbatim buffer
verb = verb + mid(in, p, 1)
p+=1
nextbyte:
' Perform a flush if we reached the flush length limit, and there
if len(verb)>=256 orelse p>len(in) and verb<>"" then
o=o+chr(255)+chr(len(verb)-1)
o=o+verb
verb=""
end if
loop
return o
end function
And here is the decompressor (much simpler)
Code: Select all
function smaz_decompress(in as string) as string
dim as string Smaz_rcb(254) = {" ", "the", "e", "t", "a", "of", "o", "and", "i", "n", "s", "e ", "r", " th",_
" t", "in", "he", "th", "h", "he ", "to", !"\r\n", "l", "s ", "d", " a", "an",_
"er", "c", " o", "d ", "on", " of", "re", "of ", "t ", ", ", "is", "u", "at",_
" ", "n ", "or", "which", "f", "m", "as", "it", "that", !"\n", "was", "en",_
" ", " w", "es", " an", " i", !"\r", "f ", "g", "p", "nd", " s", "nd ", "ed ",_
"w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ", "his",_
"st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al", "to ",_
"b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", !"\"",_
"hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ",_
"ri", "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p",_
"es ", "by", "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la",_
"h ", "ne", "as ", "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o",_
"ur", "li", "ll", "ch", "had", "this", "e t", "g ", !"e\r\n", " wh", "ere",_
" co", "e o", "a ", "us", " d", "ss", !"\n\r\n", !"\r\n\r", !"=\"", " be", " e",_
"s a", "ma", "one", "t t", "or ", "but", "el", "so", "l ", "e s", "s,", "no",_
"ter", " wa", "iv", "ho", "e a", " r", "hat", "s t", "ns", "ch ", "wh", "tr",_
"ut", "/", "have", "ly ", "ta", " ha", " on", "tha", "-", " l", "ati", "en ",_
"pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our", "who", "its", "z",_
"fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><", "ver", "ad",_
" we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi", "div",_
"e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com"}
dim o as string
dim c as string
dim p as integer=1
do until p>len(in)
c=mid(in,p,1)
if c=chr(254) then 'verbatim byte
o=o+mid(in,p+1,1)
p+=2
elseif c=chr(255) then 'verbatim string
dim l as integer=asc(mid(in,p+1,1))+1
o=o+mid(in,p+2,l)
p+=2+l
else 'Codebook entry
o=o+Smaz_rcb(asc(c))
p+=1
end if
loop
return o
end function