Integrated into cmdsqlite
See: https://github.com/thrive4/util.fb.cmdsqlite
So this started out as an example of using the
base64 decode snippet I posted earlier:
viewtopic.php?f=7&t=29771
In essence this is some code that rather roughly converts
files formatted in mhtml aka X-MAF to html or text and decodes
the mime / base64 encoded files to images, css, etc
plus alters the image anchors.
The resulting media and html or text is placed in
a folder corresponding to the <filename>.
Usage: -txt or -html <filename>.mht
More info regarding mhtml / mht can be found here:
https://docs.fileformat.com/web/mht/
https://en.wikipedia.org/wiki/MHTML
And just for the record word wrap, used for text
conversion, is extremely rough plus it is a devilishly
hard problem once you realize just breaking up
a line on a delimiter (mostly space) just does not
cut it.
https://en.wikipedia.org/wiki/Line_wrap_and_word_wrap
Code: Select all
' convert mht to html or txt and media from base64 encoded strings
' based on various code sources supplemented by Thrive4 2021
' setup image or file input
dim filename as string = command(1)
dim itemnr as integer = 1
dim listitem as string
dim texttype as string = "-html"
dim i as integer = 1
' setup text
dim chkcontenttype as boolean = false
dim tempfolder as string
dim textfile as string
Dim msg64 As String
dim textitem as string
dim chkhtml as boolean = false
dim linelength as integer = 72
' parse arguments
dim as boolean validarg = false
if command(1) = "/?" or command(1) = "-man" then
print "convert .mht file to text or html"
print "usage: -txt or -html <filename>.mht"
end
end if
select case command(1)
case "-txt"
validarg = true
case "-html"
validarg = true
case else
print "error: invalid switch " + command(1) + " valid switches are -txt or -html"
end
end select
if instr(command(2), ".mht") <> 0 and validarg then
filename = command(2)
texttype = command(1)
validarg = true
else
print "error: file " + command(2) + " not found or supported"
end
end if
tempfolder = mid(filename, instrrev(filename, "\"))
tempfolder = exepath + mid(tempfolder, 1, instrrev(tempfolder, ".") - 1)
Dim Shared As String B64
B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" & _
"abcdefghijklmnopqrstuvwxyz" & _
"0123456789+/"
Function MIMEDecode(s As String ) As Integer
If Len(s) Then
MIMEdecode = Instr(B64,s) - 1
Else
MIMEdecode = -1
End If
End Function
Function Decode64(s As String) As String
Dim As Integer w1, w2, w3, w4
Dim As String mD
For n As Integer = 1 To Len(s) Step 4
w1 = MIMEdecode(Mid(s,n+0,1))
w2 = MIMEdecode(Mid(s,n+1,1))
w3 = MIMEdecode(Mid(s,n+2,1))
w4 = MIMEdecode(Mid(s,n+3,1))
If w2 >-1 Then mD+= Chr(((w1* 4 + Int(w2/16)) And 255))
If w3 >-1 Then mD+= Chr(((w2*16 + Int(w3/ 4)) And 255))
If w4 >-1 Then mD+= Chr(((w3*64 + w4 ) And 255))
Next n
Return mD
End Function
function replace(byref haystack as string, byref needle as string, byref substitute as string) as string
'found at https://freebasic.net/forum/viewtopic.php?f=2&t=9971&p=86259&hilit=replace+character+in+string#p86259
dim as string temphaystack = haystack
dim as integer fndlen = len(needle), replen = len(substitute)
dim as integer i = instr(temphaystack, needle)
while i
temphaystack = left(temphaystack, i - 1) & substitute & mid(temphaystack, i + fndlen)
i = instr(i + replen, temphaystack, needle)
wend
return temphaystack
end function
function striphtmltags(html as string) as string
' found at https://www.freevbcode.com/ShowCode.asp?ID=1037
dim bpos as integer = InStr(html, "<")
dim epos as integer = InStr(html, ">")
dim dummy as string
Do While bpos <> 0 And epos <> 0 And epos > bpos
dummy = Mid(html, bpos, epos - bpos + 1)
html = replace(html, dummy, "")
bpos = InStr(html, "<")
epos = InStr(html, ">")
Loop
' Translate common escape sequence chars
html = Replace(html, " ", " ")
html = Replace(html, "&", "&")
html = Replace(html, """, "'")
html = Replace(html, "&#", "#")
html = Replace(html, "<", "<")
html = Replace(html, ">", ">")
html = Replace(html, "%20", " ")
html = LTrim(Trim(html))
return html
end function
Function replaceimageanchor(haystack As String, needle As String) As Integer
' based on https://rosettacode.org/wiki/Count_occurrences_of_a_substring#FreeBASIC
If haystack = "" OrElse needle = "" Then Return 0
Dim As Integer count = 0, length = Len(needle)
dim dummy as string
For i As Integer = 1 To Len(haystack)
If Mid(haystack, i, length) = needle Then
dummy = Mid(haystack, i, (instr(i, haystack, ".") + 4) - i)
haystack = replace(haystack, Mid(haystack, i, (instr(i, haystack, ".") + 4) - i), chr$(34) + mid(dummy, instrrev(dummy, "/") + 1))
count += 1
i += length - 1
End If
Next
Return count
End Function
Sub Split(array() As String, text As String, wrapchar As String = " ")
Dim As Integer bpos, epos, toks
Dim As String tok
Redim array(toks)
Do While Strptr(text)
epos = Instr(bpos + 1, text, wrapchar)
array(toks) = Mid(text, bpos + 1, epos - bpos - 1)
If epos = FALSE Then Exit Do
toks += 1
Redim Preserve array(toks)
bpos = epos
Loop
End Sub
' from https://rosettacode.org/wiki/Word_wrap#FreeBASIC translated
' very rough wordwrap results in poor readable text
Dim Shared As String array()
function wordwrap(text As String, n As Integer) as string
Split(array(), text, " ")
Dim lineitem As String = ""
dim temptxt as string = ""
For i As Integer = 0 To Ubound(array)
If Len(lineitem) = 0 Then
lineitem = lineitem & array(i)
Elseif Len(lineitem & " " & array(i)) <= n Then
lineitem = lineitem & " " & array(i)
Else
' special case no space in line chop in two
if len(lineitem) > n then
lineitem = mid(lineitem, 1, fix(n / 1.2)) + chr$(13) + chr$(10)_
+ mid(lineitem, fix(n / 1.2), len(lineitem))
end if
temptxt = temptxt + lineitem + chr$(13) + chr$(10)
lineitem = array(i)
End If
Next i
If Len(lineitem) > 0 Then
temptxt = temptxt + lineitem + chr$(13) + chr$(10)
return temptxt
end if
End function
' decode a base64 encoded file
if filename <> "" then
if mkdir(tempfolder) < 0 then
print "error: could not create folder " + tempfolder
end if
msg64 = ""
textitem = ""
select case texttype
case "-txt"
textfile = tempfolder + "\" + mid(command(2), instrrev(filename, "\"), instrrev(command(2), ".") - 1) + ".txt"
case "-html"
textfile = tempfolder + "\" + mid(command(2), instrrev(filename, "\"), instrrev(command(2), ".") - 1) + ".html"
end select
Open filename For input As 1
open textfile for output as 3
Do Until EOF(1)
' stop decoding
Line Input #1, listitem
' special case remove %2520 used in filenames images
listitem = Replace(listitem, "%2520", "")
' filter out mht header for html
select case texttype
case "-txt"
'nop
case "-html"
if instr(listitem, "<html") = 0 and chkhtml = false then
listitem = ""
else
chkhtml = true
end if
end select
if instr(listitem, "------=_NextPart") > 0 then
Print #2, Decode64(msg64)
chkcontenttype = false
msg64 = ""
close (2)
end if
' start decoding
select case true
case instr(listitem, "Content-Type: image") > 0
chkcontenttype = true
case instr(listitem, "Content-Type: text/javascript") > 0
chkcontenttype = true
case instr(listitem, "Content-Type: text/css") > 0
chkcontenttype = true
case instr(listitem, "Content-Type: font") > 0
chkcontenttype = true
end select
if chkcontenttype then
if instr(listitem, "Content-Location:") > 0 then
' output decoded images to a temp dir
open tempfolder + "\" + mid(listitem, instrrev(listitem, "/") + 1) for output as 2
end if
' ghetto validation base64
select case true
case instr(listitem, " ") > 0
'nop
case instr(listitem, "-") > 0
'nop
case instr(listitem, ":") > 0
'nop
case instr(listitem, "%") > 0
'nop
case len(listitem) = 0
'nop
case else
msg64 = msg64 + listitem
end select
end if
if chkcontenttype = false then
select case true
case instr(listitem, "------=_NextPart") > 0
listitem = ""
case instr(listitem, "Content-Type:") > 0
listitem = ""
case instr(listitem, "Content-Transfer-Encoding:") > 0
listitem = ""
case instr(listitem, "Content-Location:") > 0
listitem = ""
end select
' special cases mht
' remove frontpage thing sticks = to end of line
if mid(listitem, len(listitem)) = "=" then
listitem = mid(listitem, 1, len(listitem) - 1)
end if
select case texttype
case "-txt"
IF LEN(listitem) > 1 then
textitem = textitem + trim(listitem)
end if
case "-html"
textitem = textitem + listitem
end select
end if
itemnr += 1
Loop
' generic replace for text and html
textitem = Replace(textitem, " ", "")
textitem = Replace(textitem, "=A0", " ")
textitem = Replace(textitem, "=20", " ")
textitem = Replace(textitem, "=3D", "=")
textitem = Replace(textitem, "=09", " ")
textitem = Replace(textitem, "=C2", " ")
textitem = Replace(textitem, "=F6", "")
textitem = Replace(textitem, "=E2=80=93", "-")
textitem = replace(textitem, "=E2=80=99", "")
textitem = replace(textitem, chr$(9), "")
select case texttype
case "-txt"
textitem = striphtmltags(textitem)
textitem = Replace(textitem, " ", "")
textitem = wordwrap(textitem, linelength)
case "-html"
textitem = Replace(textitem, "=A0", " ")
textitem = Replace(textitem, "=20", " ")
print "nr image anchors changed: " & replaceimageanchor(textitem, chr$(34) + "file:///")
end select
print #3, textitem
close
end if
end