Code: Select all
'
'
' (c) Copyright Edward . Montague .
' 2018 .
'
'
'
' Test using Human Chromosome 1 .
'
' Data from :
'
' http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr1.fa.zip ?
'
' nBLAST , nucleotide BLAST
'
'
' Chromosome 1 is the designation for the largest human chromosome.
' Humans have two copies of chromosome 1, as they do with all of the autosomes,
' which are the non-sex chromosomes.
'
' Chromosome 1 spans about 249 million nucleotide base pairs,
' which are the basic units of information for DNA.
'
'
' Genome and nibble representation of base characters ,
' for alignment test ; there are four upper bits beyond this , use for other logic.
'
' A = 1 , C = 2 , G = 4 , T = 8
'
'
' I'm using a two stage approach to finding a sub sequence match .
'
' if statement , in for loop , doesn't affect the calculation time much .
'
'
dim as ubyte a,b,c ' ubyte = 8 bits .
Dim As Integer th,be,l,m,q, secs , mins , hrs
Dim Start As Double , Finish As Double
dim as string gs,cs,s
dim as integer gl , cl , dl , t, i,j,k
'
Dim as string filename
Dim filearr() as string
Dim as integer ub
'
'
declare sub readfile(Byref fn as string, filearr() as string)
'
declare function nib_cmp(a as ubyte,b as ubyte) as ubyte
declare function chr2bin(s as string)as ubyte
declare function bin2chr(a as ubyte)as string
'
' ------------------ main ---------------------------------------------
'
'
' A few seconds required to load this .
'
' your directory here :
'
filename ="/home/edward/Downloads/chr1.fa"
'
print
print " ---------- begin --------------------- "
print
Start = Timer
readfile(filename, filearr())
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " read in file . "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print filearr(1)
print filearr(709)
print filearr(909)
print
print " done 1 . "
print
'
ub=ubound(filearr)
gs=""
Start = Timer
for i=2 to ub
cs=filearr(i)
gs = gs + cs
next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " produced continuos string . "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " done 2 . "
print
gl=len(gs)
dim ga(1 to gl) as ubyte
Start = Timer
for i=1 to gl
s= mid(gs,i,1)
a=chr2bin(s)
ga(i)=a
next i
'
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print
print " converted to unique binary representation array. "
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " done 3 . "
print
'
' ----------- sub sequence , binary coded ---------------------
'
' Your sequence to search for here :
' Will require conversion from chr2bin , if in usual format .
'
' Presently I'm just using a known sub sequence from binary coded chr1 .
'
dl=8000
dim gd(1 to dl) as ubyte
'
for i=1 to dl
a=ga(i+35000)
gd(i)=a
next i
'
'
print " gd(4) : ";gd(4)
print
'
' sub sub sequence , binary coded .
'
cl=80
cl=16
dim gc(1 to cl) as ubyte
'
for i=1 to cl
gc(i)=gd(i)
next i
'
dim gn(1 to gl)as integer
'
print
print " sequence length : ";gl
print " sub sequence length : ";dl
print " sub sub sequence length : ";cl
print
print " sub sub sequence is : AGCAGCTTGGTGCCTC "
for i=1 to cl
a = gc(i)
s = bin2chr(a )
print s;
next i
print
print " done 4 . "
print
'
' ...............................................
'
'
' Number of base errors permissable , be .
'
be=0
th=cl-be
'
' Main loops .
'
' sub sub sequence search and compare .
'
m=1
Start = Timer
for i=1 to gl-cl
t=0
k=1
for j=i to i+cl
a=ga(j)
b=gc(k)
c= nib_cmp(a ,b )
t=t+c
k=k+1
next j
if (t = th) then
gn(m) = i
m = m + 1
else
t=0
end if
next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " first stage complete "
print
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " Number of matches : ";m-1
print " first match @ : ";gn(1);" ";ga(gn(1))
print " second match @ : ";gn(2);" ";ga(gn(2))
print " third match @ : ";gn(3);" ";ga(gn(3))
print
'
' 1 35001
'
' 2 205517
'
' Now using complete sub sequence .
'
dim gp(1 to m) as integer
print
print " dl = ";dl
print
th=dl-be
q=1
Start = Timer
for i=1 to m-1
t=0
k=1
l=gn(i)
for j=l to l+dl-1
a=ga(j)
b=gd(k)
c= nib_cmp(a ,b )
t=t+c
k=k+1
next j
if (t = th) then
gp(q) = l
q = q + 1
else
t=0
end if
next i
Finish=Timer
secs = Finish-Start
hrs = int(secs/3600)
mins = int((secs- 3600*hrs)/60)
secs =secs -hrs*3600-mins*60
print " second stage complete "
print
print " Duration : ";hrs;" hrs ";mins;" mins ";secs;" seconds "
print
print " Number of matches : ";q-1
print " first match @ : ";gp(1);" ";ga(gp(1))
print " second match @ : ";gp(2);" ";ga(gp(2))
print " third match @ : ";gp(3)
'
' 1 35001
'
print
print " done 5 fin "
Sleep
end
'
' ============================== fin ===================================
'
'
sub readfile(Byref fn as string, filearr() as string)
'
' This also ignores comment lines beginning with '#'.
' Usage:
' Dim as string filename
' Dim filearr() as string
' readfile(filename, filearr())
'
dim as integer filenum,res,outpos,i,ub
dim as string procname,ln
procname="readfile"
'print procname;": Reading ";fn
' Check if fn exists.
if (len(fn)=0) then
print procname;" ERROR: File name was blank "
end
endif
if dir(fn)="" then ' Does file exist?
print procname;" ERROR: File doesn't exist: ";fn;". Check your spelling."
end
endif
outpos=1
filenum=Freefile
res=open (fn, for Input, as #filenum)
if res > 0 then print procname;" ERROR opening ";fn:end
while (not eof(filenum))
line input #filenum, ln ' Get one whole text line
ln=trim(ln)
if (left(ln,1) <> "#") and (len(ln)>0) then
redim preserve filearr(outpos)
filearr(outpos)=ln
'print procname;": Read line: ";ln
outpos=outpos+1
endif
wend
close #filenum
ub=ubound(filearr) ' number of lines .
print procname;": ub=";ub
end sub ' readfile()
'
' ------------------------------------------------------------------
'
function nib_cmp(a as ubyte,b as ubyte) as ubyte
'
' Compare two nibbles via and operator .
'
static c as ubyte
c=0
if (a and b)>0 then c=1 else c=c
return c
end function
'
' ......................................
'
function chr2bin(s as string)as ubyte
'
' Character to unique nibble representation .
'
static c as ubyte
c = 0
s=ucase(s)
if ( s = "A" ) then c =1 else c=c
if ( s = "C" ) then c =2 else c=c
if ( s = "G" ) then c =4 else c=c
if ( s = "T" ) then c =8 else c=c
return (c)
end function
'
' ......................................
'
function bin2chr(a as ubyte)as string
'
' Unique nibble representation to character.
'
static s as string
s = ""
if ( a = 1 ) then s ="A" else s=s
if ( a = 2 ) then s ="C" else s=s
if ( a = 4 ) then s ="G" else s=s
if ( a = 8 ) then s ="T" else s=s
return (s)
end function
'
' ----------------------------------------------------------------------
'