mht 2 html

Post your FreeBASIC source, examples, tips and tricks here. Please don’t post code without including an explanation.
Post Reply
thrive4
Posts: 70
Joined: Jun 25, 2021 15:32

mht 2 html

Post by thrive4 »

further updates:
Integrated into cmdsqlite
See: https://github.com/thrive4/util.fb.cmdsqlite

So this started out as an example of using the
base64 decode snippet I posted earlier:
viewtopic.php?f=7&t=29771

In essence this is some code that rather roughly converts
files formatted in mhtml aka X-MAF to html or text and decodes
the mime / base64 encoded files to images, css, etc
plus alters the image anchors.

The resulting media and html or text is placed in
a folder corresponding to the <filename>.

Usage: -txt or -html <filename>.mht

More info regarding mhtml / mht can be found here:
https://docs.fileformat.com/web/mht/
https://en.wikipedia.org/wiki/MHTML

And just for the record word wrap, used for text
conversion, is extremely rough plus it is a devilishly
hard problem once you realize just breaking up
a line on a delimiter (mostly space) just does not
cut it.

https://en.wikipedia.org/wiki/Line_wrap_and_word_wrap

Code: Select all

' convert mht to html or txt and media from base64 encoded strings
' based on various code sources supplemented by Thrive4 2021

' setup image or file input
dim filename as string = command(1)
dim itemnr as integer = 1
dim listitem as string
dim texttype as string = "-html"
dim i as integer = 1

' setup text
dim chkcontenttype as boolean = false
dim tempfolder as string
dim textfile as string
Dim msg64 As String 
dim textitem as string
dim chkhtml as boolean = false
dim linelength as integer = 72

' parse arguments
dim as boolean validarg = false
if command(1) = "/?" or command(1) = "-man" then
    print "convert .mht file to text or html"
    print "usage: -txt or -html <filename>.mht"
    end
end if
select case command(1)
    case "-txt"
        validarg = true
    case "-html"
        validarg = true
    case else
        print "error: invalid switch " + command(1) + " valid switches are -txt or -html"
        end
end select
if instr(command(2), ".mht") <> 0 and validarg then        
    filename = command(2)
    texttype = command(1)
    validarg = true
else
    print "error: file " + command(2) + " not found or supported"
    end
end if

tempfolder = mid(filename, instrrev(filename, "\"))
tempfolder = exepath + mid(tempfolder, 1, instrrev(tempfolder, ".") - 1)

Dim Shared As String B64
B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" & _
"abcdefghijklmnopqrstuvwxyz" & _
"0123456789+/"

Function MIMEDecode(s As String ) As Integer
    If Len(s) Then
        MIMEdecode = Instr(B64,s) - 1
    Else
        MIMEdecode = -1
    End If
End Function

Function Decode64(s As String) As String
    Dim As Integer w1, w2, w3, w4
    Dim As String  mD
    For n As Integer = 1 To Len(s) Step 4
        w1 = MIMEdecode(Mid(s,n+0,1))
        w2 = MIMEdecode(Mid(s,n+1,1))
        w3 = MIMEdecode(Mid(s,n+2,1))
        w4 = MIMEdecode(Mid(s,n+3,1))
        If w2 >-1 Then mD+= Chr(((w1* 4 + Int(w2/16)) And 255))
        If w3 >-1 Then mD+= Chr(((w2*16 + Int(w3/ 4)) And 255))
        If w4 >-1 Then mD+= Chr(((w3*64 + w4        ) And 255))
    Next n
    Return mD
End Function

function replace(byref haystack as string, byref needle as string, byref substitute as string) as string
'found at https://freebasic.net/forum/viewtopic.php?f=2&t=9971&p=86259&hilit=replace+character+in+string#p86259   
    dim as string temphaystack = haystack
    dim as integer fndlen = len(needle), replen = len(substitute)
    dim as integer i = instr(temphaystack, needle)
   
    while i
        temphaystack = left(temphaystack, i - 1) & substitute & mid(temphaystack, i + fndlen)
        i = instr(i + replen, temphaystack, needle)
    wend
   
    return temphaystack

end function

function striphtmltags(html as string) as string
' found at https://www.freevbcode.com/ShowCode.asp?ID=1037

    dim bpos as integer = InStr(html, "<")
    dim epos as integer = InStr(html, ">")
    dim dummy as string
    
    Do While bpos <> 0 And epos <> 0 And epos > bpos
          dummy = Mid(html, bpos, epos - bpos + 1)
          html = replace(html, dummy, "")
          bpos = InStr(html, "<")
          epos = InStr(html, ">")
    Loop

    ' Translate common escape sequence chars
    html = Replace(html, "&nbsp;", " ")
    html = Replace(html, "&", "&")
    html = Replace(html, """, "'")
    html = Replace(html, "&#", "#")
    html = Replace(html, "<", "<")
    html = Replace(html, ">", ">")
    html = Replace(html, "%20", " ")
    html = LTrim(Trim(html))

    return html

end function

Function replaceimageanchor(haystack As String, needle As String) As Integer
' based on https://rosettacode.org/wiki/Count_occurrences_of_a_substring#FreeBASIC
  If haystack = "" OrElse needle = "" Then Return 0
  Dim As Integer count = 0, length = Len(needle)
  dim dummy as string  
  For i As Integer = 1 To Len(haystack)
    If Mid(haystack, i, length) = needle Then
    dummy = Mid(haystack, i, (instr(i, haystack, ".") + 4) - i)
    haystack = replace(haystack, Mid(haystack, i, (instr(i, haystack, ".") + 4) - i), chr$(34) + mid(dummy, instrrev(dummy, "/") + 1))
      count += 1
      i += length - 1
    End If
  Next
  Return count
End Function

Sub Split(array() As String, text As String, wrapchar As String = " ")
    Dim As Integer bpos, epos, toks
    Dim As String tok
 
    Redim array(toks)
 
    Do While Strptr(text)
        epos = Instr(bpos + 1, text, wrapchar)
        array(toks) = Mid(text, bpos + 1, epos - bpos - 1)
        If epos = FALSE Then Exit Do      
        toks += 1
        Redim Preserve array(toks)
        bpos = epos
    Loop
End Sub
 
' from https://rosettacode.org/wiki/Word_wrap#FreeBASIC translated
' very rough wordwrap results in poor readable text
Dim Shared As String array()
function wordwrap(text As String, n As Integer) as string
    Split(array(), text, " ")
    Dim lineitem As String = ""
    dim temptxt as string = ""
    For i As Integer = 0 To Ubound(array)
        If Len(lineitem) = 0 Then
            lineitem = lineitem & array(i)
        Elseif Len(lineitem & " " & array(i)) <= n Then
            lineitem = lineitem & " " & array(i)
        Else
            ' special case no space in line chop in two
            if len(lineitem) > n then
                lineitem = mid(lineitem, 1, fix(n / 1.2)) + chr$(13) + chr$(10)_
                         + mid(lineitem, fix(n / 1.2), len(lineitem))
            end if        
            temptxt = temptxt + lineitem + chr$(13) + chr$(10)
            lineitem = array(i)
        End If
   Next i

    If Len(lineitem) > 0 Then
        temptxt = temptxt + lineitem + chr$(13) + chr$(10)  
        return temptxt
    end if

End function

' decode a base64 encoded file
if filename <> "" then
    if mkdir(tempfolder) < 0  then
        print "error: could not create folder " + tempfolder
    end if
    msg64 = ""
    textitem = ""
    select case texttype
        case "-txt"
            textfile = tempfolder + "\" + mid(command(2), instrrev(filename, "\"), instrrev(command(2), ".") - 1) + ".txt"
        case "-html"
            textfile = tempfolder + "\" + mid(command(2), instrrev(filename, "\"), instrrev(command(2), ".") - 1) + ".html"
    end select
    Open filename For input As 1
    open textfile for output as 3    
    Do Until EOF(1)
        ' stop decoding
        Line Input #1, listitem
        ' special case remove %2520 used in filenames images
        listitem = Replace(listitem, "%2520", "")
        ' filter out mht header for html
        select case texttype
            case "-txt"
                'nop
            case "-html"
                if instr(listitem, "<html") = 0 and chkhtml = false then
                    listitem = ""
                else
                    chkhtml = true
                end if
        end select
        if instr(listitem, "------=_NextPart") > 0 then
            Print #2, Decode64(msg64)
            chkcontenttype = false
            msg64 = ""
            close (2)
        end if
        ' start decoding
        select case true
            case instr(listitem, "Content-Type: image") > 0
                chkcontenttype = true
            case instr(listitem, "Content-Type: text/javascript") > 0
                chkcontenttype = true
            case instr(listitem, "Content-Type: text/css") > 0
                chkcontenttype = true
            case instr(listitem, "Content-Type: font") > 0
                chkcontenttype = true
        end select
        if chkcontenttype then
            if instr(listitem, "Content-Location:") > 0 then
                ' output decoded images to a temp dir
                open tempfolder + "\" + mid(listitem, instrrev(listitem, "/") + 1) for output as 2
            end if
            ' ghetto validation base64
            select case true
                case instr(listitem, " ") > 0
                    'nop
                case instr(listitem, "-") > 0
                    'nop
                case instr(listitem, ":") > 0
                    'nop
                case instr(listitem, "%") > 0
                    'nop
                case len(listitem) = 0
                    'nop
                case else
                    msg64 = msg64 + listitem
            end select
        end if
        if chkcontenttype = false then
            select case true
                case instr(listitem, "------=_NextPart") > 0
                    listitem = ""
                case instr(listitem, "Content-Type:") > 0
                    listitem = ""
                case instr(listitem, "Content-Transfer-Encoding:") > 0
                    listitem = ""
                case instr(listitem, "Content-Location:") > 0
                    listitem = ""
            end select
            ' special cases mht
            ' remove frontpage thing sticks = to end of line
            if mid(listitem, len(listitem)) = "=" then
                listitem = mid(listitem, 1, len(listitem) - 1)
            end if
            select case texttype
                case "-txt"
                    IF LEN(listitem) > 1 then
                        textitem = textitem + trim(listitem)
                    end if
                case "-html"
                    textitem = textitem + listitem
            end select
        end if    
        itemnr += 1
    Loop
    ' generic replace for text and html
    textitem = Replace(textitem, "  ", "")
    textitem = Replace(textitem, "=A0", " ")
    textitem = Replace(textitem, "=20", " ")
    textitem = Replace(textitem, "=3D", "=")
    textitem = Replace(textitem, "=09", " ")
    textitem = Replace(textitem, "=C2", " ")
    textitem = Replace(textitem, "=F6", "")
    textitem = Replace(textitem, "=E2=80=93", "-")
    textitem = replace(textitem, "=E2=80=99", "")
    textitem = replace(textitem, chr$(9), "")
    select case texttype
        case "-txt"
            textitem = striphtmltags(textitem)
            textitem = Replace(textitem, "  ", "")
            textitem = wordwrap(textitem, linelength)
        case "-html"
            textitem = Replace(textitem, "=A0", " ")
            textitem = Replace(textitem, "=20", " ")
            print "nr image anchors changed: " & replaceimageanchor(textitem, chr$(34) + "file:///")
    end select

    print #3, textitem
    close
end if    

end

Post Reply