nBLAST in fb

Post your FreeBASIC source, examples, tips and tricks here. Please don’t post code without including an explanation.
Post Reply
Luxan
Posts: 222
Joined: Feb 18, 2009 12:47
Location: New Zealand

nBLAST in fb

Post by Luxan »

Code: Select all

'
'
'  (c) Copyright Edward . Montague . 
'       2018  .
'
'
'
'   Test using Human Chromosome 1 .
'
'  Data from :
'
'  http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr1.fa.zip ?
'
'   nBLAST  ,  nucleotide BLAST
'
'
'  Chromosome 1 is the designation for the largest human chromosome.
' Humans have two copies of chromosome 1, as they do with all of the autosomes,
' which are the non-sex chromosomes.
'
'  Chromosome 1 spans about 249 million nucleotide base pairs, 
' which are the basic units of information for DNA.
'
'
'  Genome and nibble representation of base characters ,
' for alignment test ; there are four upper bits beyond this , use for other logic.  
'
'   A = 1 , C = 2 , G = 4 , T = 8
'
'
'  I'm using a two stage approach to finding a sub sequence match .
'
'  if statement , in for loop , doesn't affect the calculation time much .
'
'
dim as ubyte a,b,c ' ubyte = 8 bits .
Dim As Integer th,be,l,m,q, secs , mins , hrs
Dim Start As Double , Finish As Double
dim as string gs,cs,s
dim as integer gl , cl , dl , t, i,j,k
'
Dim as string filename
Dim filearr() as string
Dim as integer ub
'
'
declare sub readfile(Byref fn as string,  filearr() as string)
'
declare function nib_cmp(a as ubyte,b as ubyte) as ubyte
declare function chr2bin(s as string)as ubyte
declare function bin2chr(a as ubyte)as string
'
' ------------------ main ---------------------------------------------
'
'
'  A few seconds required to load this .
'
'  your directory here :
'
filename ="/home/edward/Downloads/chr1.fa"

'
print
print " ---------- begin --------------------- "
print
Start = Timer
readfile(filename, filearr())
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " read in file . "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print filearr(1)
print filearr(709)
print filearr(909)
print
print " done 1  . "
print
'
ub=ubound(filearr)
gs=""
Start = Timer
for i=2 to ub
  cs=filearr(i)
  gs = gs + cs
next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " produced continuos string . "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " done 2  . "
print
gl=len(gs)
dim ga(1 to gl) as ubyte
Start = Timer
for i=1 to gl
    s= mid(gs,i,1)
    a=chr2bin(s)
    ga(i)=a
next i
'
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print
print " converted to unique binary representation array. "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " done 3  . "
print
'
' ----------- sub sequence , binary coded ---------------------
'
'  Your sequence to search for here :
'  Will require conversion from chr2bin , if in usual format .
'
'  Presently I'm just using a known sub sequence from binary coded chr1  .
'
dl=8000

dim gd(1 to dl) as ubyte
'
for i=1 to dl
    a=ga(i+35000)
    gd(i)=a
next i
'
'
print " gd(4) : ";gd(4)
print
'
'                           sub sub sequence , binary coded .
'
cl=80
cl=16
dim gc(1 to cl) as ubyte
'
for i=1 to cl
    gc(i)=gd(i)
next i
'
dim gn(1 to gl)as integer
'
print
print " sequence length         : ";gl
print " sub sequence length     : ";dl 
print " sub sub sequence length : ";cl
print
print " sub sub sequence is : AGCAGCTTGGTGCCTC "
for i=1 to cl
   a = gc(i)
   s = bin2chr(a )
   print s; 
next i

print
print " done 4 . "
print
'
' ...............................................
'
'
'  Number of base errors permissable , be .
'
be=0
th=cl-be
'
'  Main loops .
'
'  sub  sub  sequence search and compare .
'
m=1

Start = Timer
for i=1 to gl-cl
    t=0
    k=1
    
 for j=i to i+cl
    a=ga(j)
    b=gc(k) 
    c= nib_cmp(a ,b )
    t=t+c
    k=k+1
  next j
  
  if (t = th) then 
        gn(m) = i
        m = m + 1
  else 
        t=0
  end if  
 next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print "  first stage complete "
print
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " Number of matches : ";m-1
print " first match @     : ";gn(1);" ";ga(gn(1))
print " second match @    : ";gn(2);" ";ga(gn(2))
print " third match @     : ";gn(3);" ";ga(gn(3))
print
'
'  1  35001
'
'  2  205517
'
' Now using complete sub sequence .
'
dim gp(1 to m) as integer
print
print " dl = ";dl
print
th=dl-be
q=1
Start = Timer
for i=1 to m-1
    t=0
    k=1
    l=gn(i)

 for j=l to l+dl-1
    a=ga(j)
    b=gd(k) 
    c= nib_cmp(a ,b )
    t=t+c
    k=k+1
  next j
  
  if (t = th) then 
        gp(q) = l
        q = q + 1
  else 
        t=0
  end if  
 next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " second stage complete " 
print
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " Number of matches : ";q-1
print " first match @ : ";gp(1);"  ";ga(gp(1))
print " second match @ : ";gp(2);"  ";ga(gp(2))
print " third match @ : ";gp(3)

'
'  1  35001
'
print
print " done 5    fin    "

Sleep
end
'
' ============================== fin ===================================
'
'
sub readfile(Byref fn as string,  filearr() as string)
'
' This also ignores comment lines beginning with '#'.
' Usage:
' Dim as string filename
' Dim filearr() as string
' readfile(filename, filearr())
'
dim as integer filenum,res,outpos,i,ub
dim as string procname,ln

procname="readfile"

'print procname;": Reading ";fn
' Check if fn exists.
if (len(fn)=0) then
    print procname;" ERROR: File name was blank "
    end
endif
if dir(fn)="" then ' Does file exist?
    print procname;" ERROR: File doesn't exist: ";fn;". Check your spelling."
    end
endif

outpos=1
filenum=Freefile
res=open (fn, for Input, as #filenum)
if res > 0 then print procname;" ERROR opening ";fn:end
while (not eof(filenum))
    line input #filenum, ln ' Get one whole text line
    ln=trim(ln)
    if (left(ln,1) <> "#") and (len(ln)>0) then
        redim preserve filearr(outpos)
        filearr(outpos)=ln
        'print procname;": Read line: ";ln
        outpos=outpos+1
    endif
wend

close #filenum
ub=ubound(filearr) ' number of lines .
print procname;": ub=";ub

end sub ' readfile()
'
' ------------------------------------------------------------------
'
function nib_cmp(a as ubyte,b as ubyte) as ubyte
'
'   Compare two nibbles via and operator .
'
static c as ubyte
        c=0
       if (a and b)>0 then c=1 else c=c
        return c
       

end function
'
' ......................................
'
function chr2bin(s as string)as ubyte
'
'  Character to unique nibble representation .
'
static c as ubyte
         c = 0
         s=ucase(s)
    if ( s = "A" ) then c =1 else c=c
    if ( s = "C" ) then c =2 else c=c
    if ( s = "G" ) then c =4 else c=c
    if ( s = "T" ) then c =8 else c=c
    
    return (c)       

end function
'
' ......................................
'
function bin2chr(a as ubyte)as string
'
'  Unique nibble representation to character.
'
static s as string
         s = ""
    if ( a = 1 ) then s ="A" else s=s
    if ( a = 2 ) then s ="C" else s=s
    if ( a = 4 ) then s ="G" else s=s
    if ( a = 8 ) then s ="T" else s=s
    
    return (s)       

end function
'
' ----------------------------------------------------------------------
'












srvaldez
Posts: 3379
Joined: Sep 25, 2005 21:54

Re: nBLAST in fb

Post by srvaldez »

I was expecting some kind of asteroid blast game
the address for the data file needs a gz extension, not zip, that is http://hgdownload.soe.ucsc.edu/goldenPa ... chr1.fa.gz
Tourist Trap
Posts: 2958
Joined: Jun 02, 2015 16:24

Re: nBLAST in fb

Post by Tourist Trap »

srvaldez wrote:I was expecting some kind of asteroid blast game
It's more ground rooted unfortunately!
Wikipedia:
https://en.wikipedia.org/wiki/BLAST
In bioinformatics, BLAST (basic local alignment search tool) is an algorithm for comparing primary biological sequence information, such as the amino-acid sequences of proteins or the nucleotides of DNA and/or RNA sequences. A BLAST search enables a researcher to compare a query sequence with a library or database of sequences, and identify library sequences that resemble the query sequence above a certain threshold.
Luxan
Posts: 222
Joined: Feb 18, 2009 12:47
Location: New Zealand

Re: nBLAST in fb

Post by Luxan »

I know , looks a bit boring .
Ground rooted , well even astronauts have dna .

FreeBASIC is one of a few programming languages that now enables me to handle large files like the human genome with a fairly clean syntax .

There have been a few games created with a biological theme .

Though I'm not an great fan of Star Trek , some of their medical bay props looked
rather interesting ; perhaps worth emulating in a game .

Anyway for others that are following this thread , here's an update.

Code: Select all


'
'
'  (c) Copyright Edward . Montague .
'
'       luxan
'
'       2019  .
'
'
'
'   Test using Human Chromosome 1 .
'
'  Data from :
'
'  http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr1.fa.zip ?
'
'   nBLAST  ,  nucleotide BLAST
'
'
'  Chromosome 1 is the designation for the largest human chromosome.
' Humans have two copies of chromosome 1, as they do with all of the autosomes,
' which are the non-sex chromosomes.
'
'  Chromosome 1 spans about 249 million nucleotide base pairs,
' which are the basic units of information for DNA.
'
'
'  Genome and nibble representation of base characters ,
' for alignment test ; there are four upper bits beyond this , use for other logic. 
'
'   A = 1 , C = 2 , G = 4 , T = 8
'
'
'  I'm using a two stage approach to finding a sub sequence match .
'
'  if statement , in for loop , doesn't affect the calculation time much .
'
'  Now using two base error  , be ,  criteria ; be1 for first comparison
' and be2 for second . The total base error being  , be = be1 + be2 .
'
'  The actual base match location might require correction by subtracting a value of one
' from the found sequence . We don't want to inadvertedly introduce frame shifts
' to our results .
'
'
'
dim as ubyte a,b,c ' ubyte = 8 bits .
Dim As Integer th,be,l,m,q, secs , mins , hrs
Dim Start As Double , Finish As Double
dim as string gs,cs,s
dim as integer gl , cl , dl , t, i,j,k
'
Dim as string filename
Dim filearr() as string
Dim as integer ub
'
'
declare sub readfile(Byref fn as string,  filearr() as string)
'
declare function nib_cmp(a as ubyte,b as ubyte) as ubyte
declare function chr2bin(s as string)as ubyte
declare function bin2chr(a as ubyte)as string
'
' ------------------ main ---------------------------------------------
'
'
'  A few seconds required to load this .
'
'  your directory here :
'
filename ="/home/edward/Downloads/chr1.fa"

'
print
print " ---------- begin --------------------- "
print
Start = Timer
readfile(filename, filearr())
Finish=Timer
print " read in file . "
timez(start,finish)
print filearr(1)
print filearr(709)
print filearr(909)
print
print " done 1  . "
print
'
ub=ubound(filearr)
gs=""
Start = Timer
for i=2 to ub
  cs=filearr(i)
  gs = gs + cs
next i
Finish=Timer
print " produced continuos string . "
timez(start,finish)
print " done 2  . "
print
gl=len(gs)
dim ga(1 to gl) as ubyte
Start = Timer
for i=1 to gl
    s= mid(gs,i,1)
    a=chr2bin(s)
    ga(i)=a
next i
Finish=Timer
print " converted to unique binary representation array. "
timez(start,finish)
print " done 3  . "
print
'
' ----------- sub sequence , binary coded ---------------------
'
'  Your sequence to search for here :
'  Will require conversion from chr2bin , if in usual format .
'
'  Presently I'm just using a known sub sequence from binary coded chr1  .
'
dl=8000

dim gd(1 to dl) as ubyte
'
for i=1 to dl
    a=ga(i+35000)
    gd(i)=a
next i
'
'
print " gd(4) : ";gd(4)
print
'
'                           sub sub sequence , binary coded .
'
cl=80
cl=16
dim gc(1 to cl) as ubyte
'
for i=1 to cl
    gc(i)=gd(i)
next i
'
dim gn(1 to gl)as integer
'
print
print " sequence length         : ";gl
print " sub sequence length     : ";dl
print " sub sub sequence length : ";cl
print
print " sub sub sequence is : AGCAGCTTGGTGCCTC "
for i=1 to cl
   a = gc(i)
   s = bin2chr(a )
   print s;
next i

print
print " done 4 . "
print
'
' ...............................................
'
'
'  Number of base errors permissable , be .
'
be1=0
be2=0


be=be1 + be2
th=cl-be1
'
'  Main loops .
'
'  sub  sub  sequence search and compare .
'
m=1

Start = Timer
for i=1 to gl-cl
    t=0
    k=1
   
 for j=i to i+cl
    a=ga(j)
    b=gc(k)
    c= nib_cmp(a ,b )
    t=t+c
    k=k+1
  next j
 
  if (t = th) then
        gn(m) = i
        m = m + 1
  else
        t=0
  end if 
 next i
Finish=Timer
print "  first stage complete "
timez(start,finish)
print " Number of matches : ";m-1
print " first match @     : ";gn(1);" ";ga(gn(1))
print " second match @    : ";gn(2);" ";ga(gn(2))
print " third match @     : ";gn(3);" ";ga(gn(3))
print
'
'  1  35001 , maybe subtract one from this ??
'
'  2  205517 , maybe subtract one from this ??
'
' Now using complete sub sequence .
'
dim gp(1 to m) as integer
print
print " dl = ";dl
print
th=dl-be2
q=1
Start = Timer
for i=1 to m-1
    t=0
    k=1
    l=gn(i)

 for j=l to l+dl-1
    a=ga(j)
    b=gd(k)
    c= nib_cmp(a ,b )
    t=t+c
    k=k+1
  next j
 
  if (t = th) then
        gp(q) = l
        q = q + 1
  else
        t=0
  end if 
 next i
Finish=Timer
print " second stage complete "
timez(start,finish)
print " Number of matches : ";q-1
print " first match @ : ";gp(1);"  ";ga(gp(1))
print " second match @ : ";gp(2);"  ";ga(gp(2))
print " third match @ : ";gp(3)

'
'  1  35001   ,  should be 35000 therefore subtract one ?
'
print
print " done 5    fin    "

Sleep
end
'
' ============================== fin ===================================
'
'
sub readfile(Byref fn as string,  filearr() as string)
'
' This also ignores comment lines beginning with '#'.
' Usage:
' Dim as string filename
' Dim filearr() as string
' readfile(filename, filearr())
'
dim as integer filenum,res,outpos,i,ub
dim as string procname,ln

procname="readfile"

'print procname;": Reading ";fn
' Check if fn exists.
if (len(fn)=0) then
    print procname;" ERROR: File name was blank "
    end
endif
if dir(fn)="" then ' Does file exist?
    print procname;" ERROR: File doesn't exist: ";fn;". Check your spelling."
    end
endif

outpos=1
filenum=Freefile
res=open (fn, for Input, as #filenum)
if res > 0 then print procname;" ERROR opening ";fn:end
while (not eof(filenum))
    line input #filenum, ln ' Get one whole text line
    ln=trim(ln)
    if (left(ln,1) <> "#") and (len(ln)>0) then
        redim preserve filearr(outpos)
        filearr(outpos)=ln
        'print procname;": Read line: ";ln
        outpos=outpos+1
    endif
wend

close #filenum
ub=ubound(filearr) ' number of lines .
print procname;": ub=";ub

end sub ' readfile()
'
' ------------------------------------------------------------------
'
function nib_cmp(a as ubyte,b as ubyte) as ubyte
'
'   Compare two nibbles via and operator .
'
static c as ubyte
        c=0
       if (a and b)>0 then c=1 else c=c
        return c
       

end function
'
' ......................................
'
function chr2bin(s as string)as ubyte
'
'  Character to unique nibble representation .
'
static c as ubyte
         c = 0
         s=ucase(s)
    if ( s = "A" ) then c =1 else c=c
    if ( s = "C" ) then c =2 else c=c
    if ( s = "G" ) then c =4 else c=c
    if ( s = "T" ) then c =8 else c=c
   
    return (c)       

end function
'
' ......................................
'
function bin2chr(a as ubyte)as string
'
'  Unique nibble representation to character.
'
static s as string
         s = ""
    if ( a = 1 ) then s ="A" else s=s
    if ( a = 2 ) then s ="C" else s=s
    if ( a = 4 ) then s ="G" else s=s
    if ( a = 8 ) then s ="T" else s=s
   
    return (s)       

end function
'
' ----------------------------------------------------------------------
'
sub timez(start,finish)

  secs = Finish-Start
  hrs = int(secs/3600)
  mins = int((secs- 3600*hrs)/60)
  secs =secs -hrs*3600-mins*60
print
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
 


end sub
'
' -------------------------------------------------------------------------------------
'



Post Reply