class Bio::FastaDefline

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

specs are described in:


rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
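ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",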
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]






entry_id: Shows a possibly unique identifier. Returns a string.


list_ids: Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings.

Public Class Methods

new(str)

Parses given string.

    # File lib/bio/db/fasta/defline.rb
# Parses the given string.
#
# +str+ may contain several deflines joined by "\x01" (Ctrl-A); every
# piece is passed to add_defline in the order it appears.
def initialize(str)
  @entry_id = nil
  @list_ids = []
  @info     = {}
  @deflines = []
  str.split("\x01").each { |line| add_defline(line) }
end

Public Instance Methods

acc_version()

Shows the accession with version number. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the accession with version number, as returned by
# get_by_type('acc_version'). Returns a string or nil.
#
# The result is memoized with defined? rather than ||=, so a nil result
# is cached as well and the lookup runs only once.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end
accession()

Shows an accession number.

    # File lib/bio/db/fasta/defline.rb
# Shows an accession number.
#
# When acc_version is available, the part before the first '.' is used;
# otherwise the first element of accessions is used. The result
# (including nil) is memoized.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end
accessions()

Shows accession numbers. Returns an array of strings.

    # File lib/bio/db/fasta/defline.rb
# Shows accession numbers. Returns an array of strings.
#
# Collects every 'accession' and 'acc_version' identifier through
# get_all_by_type and removes everything from the first '.' onward, so
# versioned accessions are reported without their version.
# The array is built once and memoized.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.map! { |id| id.sub(/\..*\z/, '') }
  @accessions
end
add_defline(str)

Parses given string and adds parsed data.

    # File lib/bio/db/fasta/defline.rb
# Parses one defline string and appends the parsed record to @deflines.
#
# +str+ is tried against three patterns in order:
# 1. NSIDs ('|'-separated identifiers),
# 2. a ':'-separated identifier ("db:id"),
# 3. any other leading non-whitespace token.
# Each record has the form [ separator, id-group(s)..., description ];
# to_s relies on this layout. The identifier of the first defline that
# is added becomes @entry_id.
194 def add_defline(str)
195   case str
196   when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
197     # NSIDs
198     # examples:
199     # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
200     #
201     # note: regexp (?:) means grouping without backreferences
202     i = $1
203     d = $2
204     tks = i.split('|')
    # String#split drops a trailing empty field, so restore it when the
    # identifier ends with '|'.
205     tks << '' if i[-1,1] == '|'
    # NOTE(review): parse_NSIDs is defined elsewhere; it appears to consume
    # tokens from tks and return an array of ID groups -- confirm.
206     a = parse_NSIDs(tks)
    # The entry ID is the first ID group joined by '|'.
207     i = a[0].join('|')
208     a.unshift('|')
    # Tokens still left in tks after parsing are put back in front of the
    # description text.
209     d = tks.join('|') + ' ' + d unless tks.empty?
210     a << d
211     this_line = a
    # NOTE(review): match_EC is defined elsewhere; presumably it picks up
    # EC numbers from the text -- confirm.
212     match_EC(d)
    # A bracketed string that match_EC(x, false) rejects and that starts
    # with an uppercase letter is stored as a one-element ID; the first
    # such string is also kept as @info['organism'].
213     parse_square_brackets(d).each do |x|
214       if !match_EC(x, false) and x =~ /\A[A-Z]/ then
215         di = [  x ]
216         @list_ids << di
217         @info['organism'] = x unless @info['organism']
218       end
219     end
221   when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
222     # examples:
223     # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
224     # >emb:CACDC28 [X80034] C.albicans CDC28 gene
225     i = $1
226     d = $2
    # NOTE(review): parse_ColonSepID is defined elsewhere; its result is
    # joined with ':' below, so it presumably returns the ID parts -- confirm.
227     a = parse_ColonSepID(i)
228     i = a.join(':')
229     this_line = [ ':', a , d ]
230     match_EC(d)
    # Bracketed strings containing ':' are handed to parse_ColonSepID (the
    # return value is unused here, so it is presumably called for its side
    # effect); other bracketed strings that look like an uppercase
    # identifier are stored as one-element IDs.
231     parse_square_brackets(d).each do |x|
232       if !match_EC(x, false) and x =~ /:/ then
233         parse_ColonSepID(x)
234       elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
235         @list_ids << [ $1 ]
236       end
237     end
239   when /^\>?\s*(\S+)(?:\s+(.+))?$/
240     # examples:
241     # >ABC12345 this is test
242     i = $1
    # $2 is nil when there is no description part, hence to_s.
243     d = $2.to_s
    # The ID is registered without a trailing '.'; the record itself keeps
    # the token unchanged.
244     @list_ids << [ i.chomp('.') ]
245     this_line = [  '', [ i ], d ]
246     match_EC(d)
247   else
    # Fallback for strings matching none of the patterns above (for
    # example an empty string): the whole string is used as the ID.
248     i = str
249     d = ''
250     match_EC(i)
251     this_line = [ '', [ i ], d ]
252   end
254   @deflines << this_line
    # Only the first defline added determines the entry ID.
255   @entry_id = i unless @entry_id
256 end
description()

Shows description.

    # File lib/bio/db/fasta/defline.rb
# Shows the description of the first defline (the last element of the
# first parsed record). Returns nil when no defline has been parsed.
def description
  first_record = @deflines.first
  first_record.to_a.last
end
descriptions()

Returns descriptions.

    # File lib/bio/db/fasta/defline.rb
# Returns the descriptions of all parsed deflines as an array, one
# element per defline (the last element of each parsed record).
def descriptions
  @deflines.map { |record| record.last }
end
get(dbname)

Returns identifiers by a database name.

    # File lib/bio/db/fasta/defline.rb
# Returns an identifier by a database name.
#
# +dbname+ is converted with to_s. A value already stored in @info under
# that name is returned directly. Otherwise the first entry of @list_ids
# whose database name matches is used:
# * entries with at most two elements yield their last element;
# * longer entries are looked up through the NSIDs label table,
#   preferring 'acc_version', 'entry_id', 'locus', 'accession' and
#   'number' in that order, and falling back to the first non-nil field.
# A value that is found is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  entry = @list_ids.find { |ids| ids[0] == db }
  return cached unless entry

  found =
    if entry.size <= 2
      entry[-1]
    else
      labels = self.class::NSIDs[db]
      preferred = nil
      ['acc_version', 'entry_id', 'locus', 'accession', 'number'].each do |label|
        pos = labels.index(label)
        next unless pos

        preferred = entry[pos + 1]
        break if preferred
      end
      preferred || entry[1..-1].find { |field| field }
    end

  @info[db] = found if found
  found
end
get_all_by_type(*type_strarg)

Returns identifiers by given type.

    # File lib/bio/db/fasta/defline.rb
# Returns all identifiers of the given type(s) as an array.
#
# For every entry in @list_ids whose database name has a label list in
# NSIDs, each requested type is looked up in that label list and the
# field at the matching position is collected when it is not nil.
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |ids, found|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      value = ids[pos + 1]
      found << value if value
    end
  end
end
get_by_type(type_str)

Returns an identifier by given type.

    # File lib/bio/db/fasta/defline.rb
# Returns an identifier of the given type, or nil.
#
# The first entry in @list_ids whose NSIDs label list contains
# +type_str+ decides the result, even if its field at that position is
# nil; later entries are not consulted.
def get_by_type(type_str)
  @list_ids.each do |ids|
    labels = self.class::NSIDs[ids[0]]
    pos = labels && labels.index(type_str)
    return ids[pos + 1] if pos
  end
  nil
end
gi()

Shows the GI. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the GI, as returned by get_by_type('gi'). Returns a string or nil.
#
# The result is memoized with defined?, so a nil result is cached too.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end
id_strings()

Shows ID-like strings. Returns an array of strings.

    # File lib/bio/db/fasta/defline.rb
# Shows ID-like strings. Returns an array of strings.
#
# The result consists of:
# * every non-nil field after the database name of each multi-element
#   entry in @list_ids;
# * the sole element of single-element entries when it is made only of
#   letters, digits, '.', '-' and '_';
# * case-sensitive words from the descriptions that look like
#   identifiers (see the two patterns below).
def id_strings
  collected = []
  @list_ids.each do |entry|
    if entry.size >= 2
      collected.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      collected << entry[0]
    end
  end

  # Words are taken without any kill-regexp filtering (empty array).
  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  collected.concat(id_like_words)
end
locus()

Shows the locus. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.

    # File lib/bio/db/fasta/defline.rb
# Shows the locus, as returned by get_by_type('locus'). Returns a string
# or nil.
#
# The result is memoized with defined?, so a nil result is cached too.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end
method_missing(name, *args)
    # File lib/bio/db/fasta/defline.rb
# Treats an unknown method name as a database name and looks it up with
# get, so that e.g. +emb+ behaves like get('emb').
#
# Returns the identifier found. When nothing is found, nil is returned
# if the name is a key of NSIDs; otherwise a RuntimeError whose message
# starts with "NameError:" is raised.
# Extra arguments are forwarded to get unchanged.
def method_missing(name, *args)
  value = get(name, *args)
  return value if value || self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end
to_s()

Shows the original string. Note that the result of this method may differ from the original string that was passed to new.

    # File lib/bio/db/fasta/defline.rb
# Rebuilds a defline string from the parsed records.
#
# In each record the first element is the separator, the last element is
# the description, and the elements in between are ID groups. The groups
# are joined with the separator, followed by a space and the
# description, and the result is stripped. Records are joined with
# "\x01".
def to_s
  rendered = @deflines.map do |record|
    sep = record[0]
    ids = record[1..-2].map { |group| group.join(sep) }.join(sep)
    (ids + ' ' + record[-1]).strip
  end
  rendered.join("\x01")
end
words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash)

Shows words used in the defline. Returns an Array.

    # File lib/bio/db/fasta/defline.rb
# Shows words used in the defline. Returns a sorted Array without
# duplicates.
#
# case_sensitive:: when false or nil, words are downcased.
# kill_regexp::    array of regexps; a word matching any of them is dropped.
# kwhash::         hash keyed by downcased words that are to be dropped.
#
# All descriptions are joined and split on punctuation, spaces and
# control characters. Leading and trailing decoration characters are
# removed from each token, and tokens of one character or less are
# dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\# \x00-\x1f\x7f]+/)
  kept = tokens.select do |token|
    token.sub!(/\A[\$\*\-\+]+/, '')
    token.sub!(/[\$\*\-\=]+\z/, '')
    token.size > 1 &&
      !kwhash[token.downcase] &&
      !kill_regexp.find { |expr| expr =~ token }
  end
  kept.map! { |token| token.downcase } unless case_sensitive
  kept.sort!
  kept.uniq!
  kept
end