class Bio::FastaFormat

Treats a FASTA formatted entry, such as:

>id and/or some comments                    <== definition line
ATGCATGCATGCATGCATGCATGCATGCATGCATGC        <== sequence lines
ATGCATGCATGCATGCATGCATGCATGCATGCATGC
ATGCATGCATGC

The leading '>' can be omitted, and any trailing entries (everything from the next line that starts with '>') are removed automatically.

Examples

fasta_string = <<END_OF_STRING
>gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI
VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ
NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP
IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP
INRISARRAAIHPYFQES
END_OF_STRING

f = Bio::FastaFormat.new(fasta_string)

f.entry #=> ">gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]\n"+
# "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\n"+
# "VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\n"+
# "NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\n"+
# "IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\n"+
# "INRISARRAAIHPYFQES"

Methods related to the name of the sequence

A larger range of methods for dealing with Fasta definition lines can be found in FastaDefline, accessed through the FastaFormat#identifiers method.

f.entry_id #=> "gi|398365175"
f.first_name #=> "gi|398365175|ref|NP_009718.3|"
f.definition #=> "gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]"
f.identifiers #=> Bio::FastaDefline instance
f.accession #=> "NP_009718"
f.accessions #=> ["NP_009718"]
f.acc_version #=> "NP_009718.3"
f.comment #=> nil

Methods related to the actual sequence

f.seq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
f.data #=> "\nMSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\nVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\nNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\nIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\nINRISARRAAIHPYFQES\n"
f.length #=> 298
f.aaseq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
f.aaseq.composition #=> {"M"=>5, "S"=>15, "G"=>21, "E"=>16, "L"=>36, "A"=>17, "N"=>8, "Y"=>13, "K"=>22, "R"=>20, "V"=>18, "T"=>7, "D"=>23, "P"=>17, "Q"=>10, "I"=>23, "H"=>7, "F"=>12, "C"=>4, "W"=>4}
f.aalen #=> 298

A less structured fasta entry

f.entry #=> ">abc 123 456\nASDF"

f.entry_id #=> "abc"
f.first_name #=> "abc"
f.definition #=> "abc 123 456"
f.comment #=> nil
f.accession #=> nil
f.accessions #=> []
f.acc_version #=> nil

f.seq #=> "ASDF"
f.data #=> "\nASDF\n"
f.length #=> 4
f.aaseq #=> "ASDF"
f.aaseq.composition #=> {"A"=>1, "S"=>1, "D"=>1, "F"=>1}
f.aalen #=> 4

References

Constants

DELIMITER

Entry delimiter in flatfile text.

DELIMITER_OVERRUN

(Integer) excess read size included in DELIMITER.

Attributes

data[RW]

The sequence lines in text.

definition[RW]

The comment line of the FASTA formatted data.

entry_overrun[R]

Public Class Methods

new(str) click to toggle source

Stores the comment and sequence information from one entry of the FASTA format string. If the argument contains more than one entry, only the first entry is used.

    # File lib/bio/db/fasta.rb
133 def initialize(str)
134   @definition = str[/.*/].sub(/^>/, '').strip       # 1st line
135   @data = str.sub(/.*/, '')                         # rests
136   @data.sub!(/^>.*/m, '')   # remove trailing entries for sure
137   @entry_overrun = $&
138 end

Public Instance Methods

aalen() click to toggle source

Returns the length of Bio::Sequence::AA.

    # File lib/bio/db/fasta.rb
223 def aalen
224   self.aaseq.length
225 end
aaseq() click to toggle source

Returns the Bio::Sequence::AA.

    # File lib/bio/db/fasta.rb
218 def aaseq
219   Sequence::AA.new(seq)
220 end
acc_version() click to toggle source

Returns accession number with version.

    # File lib/bio/db/fasta.rb
279 def acc_version
280   identifiers.acc_version
281 end
accession() click to toggle source

Returns an accession number.

    # File lib/bio/db/fasta.rb
267 def accession
268   identifiers.accession
269 end
accessions() click to toggle source

Parses the FASTA defline (using the identifiers method) and returns the accession numbers as an array of strings.

    # File lib/bio/db/fasta.rb
274 def accessions
275   identifiers.accessions
276 end
blast(factory)
Alias for: query
comment() click to toggle source

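Returns the comments embedded in the sequence data (lines starting with '#') as a Hash keyed by the sequence offset at which each comment appears. Returns nil when the data does not begin with a comment line.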

    # File lib/bio/db/fasta.rb
197 def comment
198   seq
199   @comment
200 end
entry() click to toggle source

Returns the stored entry as a FASTA-formatted string (same as to_s).

    # File lib/bio/db/fasta.rb
141 def entry
142   @entry = ">#{@definition}\n#{@data.strip}\n"
143 end
Also aliased as: to_s
entry_id() click to toggle source

Parses the FASTA defline (using the identifiers method) and returns a possibly unique identifier as a string.

    # File lib/bio/db/fasta.rb
253 def entry_id
254   identifiers.entry_id
255 end
fasta(factory)
Alias for: query
first_name() click to toggle source

Returns the first name (word) of the definition line - everything before the first whitespace.

>abc def #=> 'abc'
>gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c] #=> 'gi|398365175|ref|NP_009718.3|'
>abc #=> 'abc'
    # File lib/bio/db/fasta.rb
294 def first_name
295   index = definition.index(/\s/)
296   if index.nil?
297     return @definition
298   else
299     return @definition[0...index]
300   end
301 end
gi() click to toggle source

Parses the FASTA defline (using the identifiers method) and returns the GI/locus/accession/accession with version number. If an entry has multiple such IDs, only the first ID is returned. It returns a string or nil.

    # File lib/bio/db/fasta.rb
262 def gi
263   identifiers.gi
264 end
identifiers() click to toggle source

Parses the FASTA defline and extracts IDs. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs. It returns a Bio::FastaDefline instance.

    # File lib/bio/db/fasta.rb
243 def identifiers
244   unless defined?(@ids) then
245     @ids = FastaDefline.new(@definition)
246   end
247   @ids
248 end
length() click to toggle source

Returns sequence length.

    # File lib/bio/db/fasta.rb
203 def length
204   seq.length
205 end
locus() click to toggle source

Returns locus.

    # File lib/bio/db/fasta.rb
284 def locus
285   identifiers.locus
286 end
nalen() click to toggle source

Returns the length of Bio::Sequence::NA.

    # File lib/bio/db/fasta.rb
213 def nalen
214   self.naseq.length
215 end
naseq() click to toggle source

Returns the Bio::Sequence::NA.

    # File lib/bio/db/fasta.rb
208 def naseq
209   Sequence::NA.new(seq)
210 end
query(factory) click to toggle source

Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast factory object.

#!/usr/bin/env ruby
require 'bio'

factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
flatfile.each do |entry|
  p entry.definition
  result = entry.fasta(factory)
  result.each do |hit|
    print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
    p hit.lap_at
  end
end
    # File lib/bio/db/fasta.rb
164 def query(factory)
165   factory.query(entry)
166 end
Also aliased as: fasta, blast
seq() click to toggle source

Returns the sequence lines joined into a single Bio::Sequence::Generic object, with whitespace and digits removed.

    # File lib/bio/db/fasta.rb
171 def seq
172   unless defined?(@seq)
173     unless /\A\s*^\#/ =~ @data then
174       @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
175     else
176       a = @data.split(/(^\#.*$)/)
177       i = 0
178       cmnt = {}
179       s = []
180       a.each do |x|
181         if /^# ?(.*)$/ =~ x then
182           cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1
183         else
184           x.tr!(" \t\r\n0-9", '') # lazy clean up
185           i += x.length
186           s << x
187         end
188       end
189       @comment = cmnt
190       @seq = Bio::Sequence::Generic.new(s.join(''))
191     end
192   end
193   @seq
194 end
to_biosequence() click to toggle source

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this FastaFormat object might also be changed (but not always), for efficiency reasons.

    # File lib/bio/db/fasta.rb
234 def to_biosequence
235   Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat)
236 end
Also aliased as: to_seq
to_s()
Alias for: entry
to_seq()
Alias for: to_biosequence