class Bio::FastaDefline

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples¶ ↑

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References¶ ↑

Fasta format description (NCBI) www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.) (Dead link. Please find in web.archive.org/ ). blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
Program Parameters for formatdb and fastacmd (by Tao Tao) www.ncbi.nlm.nih.gov/staff/tao/URLAPI/formatdb_fastacmd.html#t1.1
Formatdb README ftp.ncbi.nih.gov/blast/documents/formatdb.html

Constants

KillRegexpArray
KillWords
KillWordsHash
NSIDs

Attributes

entry_id[R]

Shows a possibly unique identifier. Returns a string.

list_ids[R]

Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.

Public Class Methods

new(str) click to toggle source

Parses given string.

    # File lib/bio/db/fasta/defline.rb
# Parses the given string.
#
# +str+ may hold several deflines concatenated with the "\x01" (Ctrl-A)
# character; each piece is handed to #add_defline in order.
def initialize(str)
  @entry_id = nil
  @list_ids = []
  @info = {}
  @deflines = []

  str.split("\x01").each { |single_defline| add_defline(single_defline) }
end

Public Instance Methods

acc_version() click to toggle source

Shows the accession with its version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the accession with its version number, as found by
# get_by_type('acc_version'). Returns a string or nil.
#
# The result (including nil) is memoized in @acc_version, so the lookup
# runs at most once per object.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end

accession() click to toggle source

Shows an accession number.

    # File lib/bio/db/fasta/defline.rb
# Shows an accession number.
#
# When #acc_version is available, the part before the first "." is used;
# otherwise the first element of #accessions is used. The result
# (including nil) is memoized in @accession.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end

accessions() click to toggle source

Shows accession numbers. Returns an array of strings.

    # File lib/bio/db/fasta/defline.rb
# Shows accession numbers. Returns an array of strings.
#
# Collects every 'accession' and 'acc_version' identifier via
# #get_all_by_type and strips everything from the first "." onwards.
# The array is memoized in @accessions.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.map! { |id| id.sub(/\..*\z/, '') }
end

add_defline(str) click to toggle source

Parses given string and adds parsed data.

    # File lib/bio/db/fasta/defline.rb
# Parses a single defline +str+ and adds the parsed data to this object.
#
# Three formats are tried in order:
# 1. NSIDs ("|"-separated identifiers),
# 2. a ":"-separated ID,
# 3. a single leading word used as the ID.
# Anything else is stored verbatim as the ID with an empty description.
#
# Effects visible in this method: one entry is always appended to @deflines
# (separator, ID part(s), description); @list_ids and @info['organism'] may
# be extended; @entry_id is set only if it has not been set yet.
# parse_NSIDs, parse_ColonSepID, parse_square_brackets and match_EC are
# defined elsewhere in the class and are not shown here.
194 def add_defline(str)
195   case str
196   when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
197     # NSIDs
198     # examples:
199     # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
200     #
201     # note: regexp (?:...) means grouping without capturing
        # $1/$2 are copied to locals immediately, before any later regexp
        # match can overwrite them.
202     i = $1
203     d = $2
204     tks = i.split('|')
        # String#split drops a trailing empty field, so restore it when the
        # ID part ends with "|".
205     tks << '' if i[-1,1] == '|'
206     a = parse_NSIDs(tks)
        # NOTE(review): a[0] is assumed to be non-nil here, i.e. parse_NSIDs
        # returned at least one ID group -- confirm against parse_NSIDs.
207     i = a[0].join('|')
208     a.unshift('|')
        # Whatever is still in tks at this point is prepended to the
        # description (presumably tokens parse_NSIDs did not consume -- verify).
209     d = tks.join('|') + ' ' + d unless tks.empty?
210     a << d
211     this_line = a
212     match_EC(d)
213     parse_square_brackets(d).each do |x|
          # Bracketed text that is not matched by match_EC and starts with a
          # capital letter is recorded as an ID; the first one also becomes
          # @info['organism'].
214       if !match_EC(x, false) and x =~ /\A[A-Z]/ then
215         di = [  x ]
216         @list_ids << di
217         @info['organism'] = x unless @info['organism']
218       end
219     end
220 
221   when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
222     # examples:
223     # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
224     # >emb:CACDC28 [X80034] C.albicans CDC28 gene
225     i = $1
226     d = $2
227     a = parse_ColonSepID(i)
228     i = a.join(':')
229     this_line = [ ':', a , d ]
230     match_EC(d)
231     parse_square_brackets(d).each do |x|
          # Bracketed "db:id" text goes through parse_ColonSepID; a bare
          # upper-case token (e.g. [X80034]) is recorded as an ID by itself.
232       if !match_EC(x, false) and x =~ /:/ then
233         parse_ColonSepID(x)
234       elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
235         @list_ids << [ $1 ]
236       end
237     end
238 
239   when /^\>?\s*(\S+)(?:\s+(.+))?$/
240     # examples:
241     # >ABC12345 this is test
242     i = $1
243     d = $2.to_s
        # The ID is listed without a trailing "."; the stored defline keeps
        # the word unchanged.
244     @list_ids << [ i.chomp('.') ]
245     this_line = [  '', [ i ], d ]
246     match_EC(d)
247   else
        # Nothing matched: keep the whole string as the ID.
248     i = str
249     d = ''
250     match_EC(i)
251     this_line = [ '', [ i ], d ]
252   end
253 
254   @deflines << this_line
      # Only the first parsed defline determines the entry ID.
255   @entry_id = i unless @entry_id
256 end

description() click to toggle source

Shows description.

    # File lib/bio/db/fasta/defline.rb
# Shows the description of the first defline: the last element of the
# first entry in @deflines. Returns nil when no defline has been parsed.
def description
  first_defline = @deflines.first
  first_defline.to_a.last
end

descriptions() click to toggle source

Returns descriptions.

    # File lib/bio/db/fasta/defline.rb
# Returns the descriptions of all parsed deflines, i.e. the last element
# of every entry in @deflines, in parsing order.
def descriptions
  @deflines.map { |parsed_line| parsed_line[-1] }
end

get(dbname) click to toggle source

Returns an identifier by database name.

    # File lib/bio/db/fasta/defline.rb
# Returns an identifier by database name.
#
# +dbname+ is converted with to_s, so both strings and symbols work.
# A value already stored in @info under that name wins. Otherwise the first
# entry of @list_ids whose tag equals the name is used:
# * entries with at most two elements yield their last element;
# * longer entries yield the first non-nil field among 'acc_version',
#   'entry_id', 'locus', 'accession', 'number' (in that priority), located
#   through the label list in NSIDs, falling back to the first non-nil field.
# A found value is cached in @info. Returns nil when nothing is found.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  if di.size <= 2
    r = di[-1]
  else
    # A tag missing from NSIDs has no label list; use an empty one so the
    # generic "first non-nil field" fallback below still applies instead of
    # failing on nil.
    labels = self.class::NSIDs[db] || []
    r = nil
    %w[acc_version entry_id locus accession number].each do |label|
      pos = labels.index(label)
      next unless pos

      r = di[pos + 1]
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end

  @info[db] = r if r
  r
end

get_all_by_type(*type_strarg) click to toggle source

Returns identifiers by given type.

    # File lib/bio/db/fasta/defline.rb
# Returns all identifiers of the given type(s) as an array.
#
# For every entry of @list_ids whose tag has a label list in NSIDs, each
# requested type is looked up in that label list and the corresponding
# field is collected when it is non-nil. Entries are visited in order and,
# within an entry, types in the order given.
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |entry, found|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    type_strarg.each do |wanted|
      pos = labels.index(wanted)
      next unless pos

      value = entry[pos + 1]
      found << value if value
    end
  end
end

get_by_type(type_str) click to toggle source

Returns an identifier by given type.

    # File lib/bio/db/fasta/defline.rb
# Returns an identifier by given type.
#
# Scans @list_ids in order and stops at the first entry whose NSIDs label
# list contains +type_str+; the field at that label's position is returned
# as is (it may be nil). Returns nil when no entry carries the label.
def get_by_type(type_str)
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    pos = labels && labels.index(type_str)
    return entry[pos + 1] if pos
  end
  nil
end

gi() click to toggle source

Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the GI, as found by get_by_type('gi'). Returns a string or nil.
#
# The result (including nil) is memoized in @gi.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end

id_strings() click to toggle source

Shows ID-like strings. Returns an array of strings.

    # File lib/bio/db/fasta/defline.rb
# Shows ID-like strings. Returns an array of strings.
#
# The result is built from two sources, in this order:
# 1. @list_ids: for entries with two or more elements, every non-nil field
#    after the tag; for shorter entries, the single element itself when it
#    consists only of letters, digits, ".", "-" and "_".
# 2. Case-sensitive words of the descriptions (words(true, [])) that look
#    like identifiers: a capitalized token containing digits followed by
#    more characters, or an all-caps token containing an underscore.
def id_strings
  from_ids = @list_ids.flat_map do |entry|
    if entry.size >= 2
      entry[1..-1].select { |field| field }
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      [entry[0]]
    else
      []
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end

  from_ids.concat(id_like_words)
end

locus() click to toggle source

Shows the locus. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the locus, as found by get_by_type('locus'). Returns a string or nil.
#
# The result (including nil) is memoized in @locus.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end

method_missing(name, *args) click to toggle source

    # File lib/bio/db/fasta/defline.rb
# Lets database names be used as reader methods, e.g. +defline.emb+ is
# the same as +defline.get('emb')+.
#
# Returns the value of #get for +name+. When #get finds nothing, nil is
# returned if +name+ is a tag known to NSIDs; any other name is passed to
# +super+, which raises NoMethodError.
#
# Raising a real NoMethodError (instead of a RuntimeError whose text merely
# says "NameError") matters: Ruby's implicit-conversion probes such as
# to_ary / to_str only rescue NoMethodError, so e.g. Array(defline) would
# otherwise fail.
def method_missing(name, *args)
  # raise ArgumentError,
  # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
  r = get(name, *args)
  return r if r || self.class::NSIDs[name.to_s]

  super
end

# Keeps respond_to? consistent with #method_missing: true for NSIDs tags
# and for any name #get can resolve.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs[key] || get(key)

  super
end

to_s() click to toggle source

Shows the defline as a string. Note that the result may differ from the original string that was given to FastaDefline.new.

    # File lib/bio/db/fasta/defline.rb
327 def to_s
328   @deflines.collect { |a|
329     s = a[0]
330     (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
331   }.join("\x01")
332 end

words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) click to toggle source

Shows words used in the defline. Returns an Array.

    # File lib/bio/db/fasta/defline.rb
# Shows words used in the defline. Returns a sorted Array without duplicates.
#
# The joined descriptions are split on punctuation, whitespace and control
# characters. From every token, leading "$", "*", "-", "+" and trailing
# "$", "*", "-", "=" characters are removed. A token is then dropped when
# it is at most one character long, when its downcased form is a key of
# +kwhash+, or when any regexp in +kill_regexp+ matches it.
# Unless +case_sensitive+ is true, the remaining words are downcased.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\# \x00-\x1f\x7f]+/)

  trimmed = tokens.map do |token|
    token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
  end

  kept = trimmed.reject do |word|
    word.size <= 1 ||
      kwhash[word.downcase] ||
      kill_regexp.find { |expr| expr =~ word }
  end

  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end

class Bio::FastaDefline

Examples¶ ↑

References¶ ↑

`References`¶ ↑