class Bio::FastaFormat
Treats a FASTA formatted entry, such as:
>id and/or some comments <== definition line ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines ATGCATGCATGCATGCATGCATGCATGCATGCATGC ATGCATGCATGC
The leading ‘>’ can be omitted, and a trailing ‘>’ will be removed automatically.
Examples¶ ↑
fasta_string = <<END_OF_STRING >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c] MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP INRISARRAAIHPYFQES END_OF_STRING f = Bio::FastaFormat.new(fasta_string) f.entry #=> ">gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]\n"+ # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\n"+ # VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\n"+ # NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\n"+ # IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\n"+ # INRISARRAAIHPYFQES"
Methods related to the name of the sequence¶ ↑
A larger range of methods for dealing with Fasta
definition lines can be found in FastaDefline
, accessed through the FastaFormat#identifiers
method.
f.entry_id #=> "gi|398365175" f.first_name #=> "gi|398365175|ref|NP_009718.3|" f.definition #=> "gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]" f.identifiers #=> Bio::FastaDefline instance f.accession #=> "NP_009718" f.accessions #=> ["NP_009718"] f.acc_version #=> "NP_009718.3" f.comment #=> nil
Methods related to the actual sequence¶ ↑
f.seq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES" f.data #=> "\nMSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\nVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\nNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\nIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\nINRISARRAAIHPYFQES\n" f.length #=> 298 f.aaseq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES" f.aaseq.composition #=> {"M"=>5, "S"=>15, "G"=>21, "E"=>16, "L"=>36, "A"=>17, "N"=>8, "Y"=>13, "K"=>22, "R"=>20, "V"=>18, "T"=>7, "D"=>23, "P"=>17, "Q"=>10, "I"=>23, "H"=>7, "F"=>12, "C"=>4, "W"=>4} f.aalen #=> 298
A less structured fasta entry¶ ↑
f.entry #=> ">abc 123 456\nASDF" f.entry_id #=> "abc" f.first_name #=> "abc" f.definition #=> "abc 123 456" f.comment #=> nil f.accession #=> nil f.accessions #=> [] f.acc_version #=> nil f.seq #=> "ASDF" f.data #=> "\nASDF\n" f.length #=> 4 f.aaseq #=> "ASDF" f.aaseq.composition #=> {"A"=>1, "S"=>1, "D"=>1, "F"=>1} f.aalen #=> 4
References
¶ ↑
-
FASTA format (Wikipedia) en.wikipedia.org/wiki/FASTA_format
Constants
- DELIMITER
Entry delimiter in flatfile text.
- DELIMITER_OVERRUN
(Integer) excess read size included in
DELIMITER
.
Attributes
The sequence lines in text.
The comment line of the FASTA formatted data.
Public Class Methods
Stores the comment and sequence information from one entry of the FASTA format string. If the argument contains more than one entry, only the first entry is used.
# File lib/bio/db/fasta.rb 133 def initialize(str) 134 @definition = str[/.*/].sub(/^>/, '').strip # 1st line 135 @data = str.sub(/.*/, '') # rests 136 @data.sub!(/^>.*/m, '') # remove trailing entries for sure 137 @entry_overrun = $& 138 end
Public Instance Methods
Returns the length of Bio::Sequence::AA
.
# File lib/bio/db/fasta.rb 223 def aalen 224 self.aaseq.length 225 end
Returns the sequence as a Bio::Sequence::AA
.
# File lib/bio/db/fasta.rb 218 def aaseq 219 Sequence::AA.new(seq) 220 end
Returns accession number with version.
# File lib/bio/db/fasta.rb 279 def acc_version 280 identifiers.acc_version 281 end
Returns an accession number.
# File lib/bio/db/fasta.rb 267 def accession 268 identifiers.accession 269 end
Parses the FASTA defline (using the identifiers
method) and returns the accession numbers as an array of strings.
# File lib/bio/db/fasta.rb 274 def accessions 275 identifiers.accessions 276 end
Returns comments.
# File lib/bio/db/fasta.rb 197 def comment 198 seq 199 @comment 200 end
Returns the stored entry in FASTA format (same as to_s
)
# File lib/bio/db/fasta.rb 141 def entry 142 @entry = ">#{@definition}\n#{@data.strip}\n" 143 end
Parses the FASTA defline (using the identifiers
method) and returns a possibly unique identifier as a string.
# File lib/bio/db/fasta.rb 253 def entry_id 254 identifiers.entry_id 255 end
Returns the first name (word) of the definition line - everything before the first whitespace.
>abc def #=> 'abc' >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c] #=> 'gi|398365175|ref|NP_009718.3|' >abc #=> 'abc'
# File lib/bio/db/fasta.rb 294 def first_name 295 index = definition.index(/\s/) 296 if index.nil? 297 return @definition 298 else 299 return @definition[0...index] 300 end 301 end
Parses the FASTA defline (using the identifiers
method) and returns the GI/locus/accession/accession with version number. If an entry has more than one such ID, only the first ID is returned. It returns a string or nil.
# File lib/bio/db/fasta.rb 262 def gi 263 identifiers.gi 264 end
Parses the FASTA defline and extracts IDs. IDs are NSIDs (NCBI
standard FASTA sequence identifiers) or “:”-separated IDs. It returns a Bio::FastaDefline
instance.
# File lib/bio/db/fasta.rb 243 def identifiers 244 unless defined?(@ids) then 245 @ids = FastaDefline.new(@definition) 246 end 247 @ids 248 end
Returns sequence length.
# File lib/bio/db/fasta.rb 203 def length 204 seq.length 205 end
Returns locus.
# File lib/bio/db/fasta.rb 284 def locus 285 identifiers.locus 286 end
Returns the length of Bio::Sequence::NA
.
# File lib/bio/db/fasta.rb 213 def nalen 214 self.naseq.length 215 end
Returns the sequence as a Bio::Sequence::NA
.
# File lib/bio/db/fasta.rb 208 def naseq 209 Sequence::NA.new(seq) 210 end
Executes FASTA/BLAST search by using a Bio::Fasta
or a Bio::Blast
factory object.
#!/usr/bin/env ruby require 'bio' factory = Bio::Fasta.local('fasta34', 'db/swissprot.f') flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f') flatfile.each do |entry| p entry.definition result = entry.fasta(factory) result.each do |hit| print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at " p hit.lap_at end end
# File lib/bio/db/fasta.rb 164 def query(factory) 165 factory.query(entry) 166 end
Returns a joined sequence line as a String.
# File lib/bio/db/fasta.rb 171 def seq 172 unless defined?(@seq) 173 unless /\A\s*^\#/ =~ @data then 174 @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up 175 else 176 a = @data.split(/(^\#.*$)/) 177 i = 0 178 cmnt = {} 179 s = [] 180 a.each do |x| 181 if /^# ?(.*)$/ =~ x then 182 cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1 183 else 184 x.tr!(" \t\r\n0-9", '') # lazy clean up 185 i += x.length 186 s << x 187 end 188 end 189 @comment = cmnt 190 @seq = Bio::Sequence::Generic.new(s.join('')) 191 end 192 end 193 @seq 194 end
Returns sequence as a Bio::Sequence
object.
Note: If you modify the returned Bio::Sequence
object, the sequence or definition in this FastaFormat
object might also be changed (though not always), because data may be shared for efficiency.
# File lib/bio/db/fasta.rb 234 def to_biosequence 235 Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat) 236 end