class Bio::FastaDefline

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Constants

KillRegexpArray
KillWords
KillWordsHash
NSIDs

Attributes

entry_id[R]

Shows a possibly unique identifier. Returns a string.

list_ids[R]

Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings.

Public Class Methods

new(str) click to toggle source

Parses given string.

# File lib/bio/db/fasta/defline.rb, line 180
# Builds a new object from a defline string.
#
# +str+ may hold several deflines joined by the "\x01" (Ctrl-A) character;
# each piece is handed to #add_defline in order.
def initialize(str)
  # Parsed state, filled in by #add_defline.
  @deflines = []
  @info = {}
  @list_ids = []
  @entry_id = nil

  str.split("\x01").each { |single_defline| add_defline(single_defline) }
end

Public Instance Methods

acc_version() click to toggle source

Shows the accession with version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

# File lib/bio/db/fasta/defline.rb, line 492
# Returns the accession with version number (looked up through
# get_by_type('acc_version')), or nil when there is none.
# The result, including nil, is memoized in @acc_version.
def acc_version
  return @acc_version if defined?(@acc_version)
  @acc_version = get_by_type('acc_version')
end
accession() click to toggle source

Shows an accession number.

# File lib/bio/db/fasta/defline.rb, line 510
# Returns an accession number (without version).
#
# When #acc_version yields a value, the part before the first "." is used;
# otherwise the first element of #accessions is used (nil if there is none).
# The result is memoized in @accession.
def accession
  return @accession if defined?(@accession)
  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end
accessions() click to toggle source

Shows accession numbers. Returns an array of strings.

# File lib/bio/db/fasta/defline.rb, line 501
# Returns all accession numbers as an array of strings.
#
# Values of type 'accession' and 'acc_version' are collected through
# get_all_by_type, and everything from the first "." onwards (the version
# suffix) is removed. The array is memoized in @accessions.
def accessions
  return @accessions if defined?(@accessions)
  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.map! { |id| id.sub(/\..*\z/, '') }
  @accessions
end
add_defline(str) click to toggle source

Parses given string and adds parsed data.

# File lib/bio/db/fasta/defline.rb, line 194
# Parses one defline string and adds the parsed data to this object.
#
# The input is matched against three patterns, tried in order:
# 1. "|"-separated identifiers (NSID style), optionally followed by a
#    description;
# 2. a "name:value" identifier, optionally followed by a description;
# 3. a single whitespace-free token, optionally followed by a description.
# Anything that matches none of them (such as an empty string) is kept
# as-is with an empty description.
#
# Effects visible in this method:
# * a record is appended to @deflines; its first element is the ID
#   separator ('|', ':' or ''), its last element is the description;
# * @list_ids and @info['organism'] may be updated;
# * @entry_id is set from this line unless it already has a value.
#
# The helpers parse_NSIDs, parse_ColonSepID, parse_square_brackets and
# match_EC are defined outside this block; notes about them below are
# assumptions to be confirmed against their definitions.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    i = $1
    d = $2
    tks = i.split('|')
    # String#split drops trailing empty fields; put one back when the
    # ID part ends with '|'.
    tks << '' if i[-1,1] == '|'
    # NOTE(review): parse_NSIDs presumably consumes the tokens it
    # recognizes from tks (in place) -- confirm in its definition.
    a = parse_NSIDs(tks)
    # The first parsed ID becomes the candidate entry ID for this line.
    i = a[0].join('|')
    a.unshift('|')
    # Tokens still left in tks are treated as part of the description.
    d = tks.join('|') + ' ' + d unless tks.empty?
    a << d
    this_line = a
    # NOTE(review): presumably records EC numbers found in the description.
    match_EC(d)
    parse_square_brackets(d).each do |x|
      # A bracketed text that match_EC rejects and that starts with a
      # capital letter is kept as an ID-like string; the first such text
      # is also stored as the organism.
      if !match_EC(x, false) and x =~ /\A[A-Z]/ then
        di = [  x ]
        @list_ids << di
        @info['organism'] = x unless @info['organism']
      end
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # examples:
    # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    i = $1
    d = $2
    a = parse_ColonSepID(i)
    i = a.join(':')
    this_line = [ ':', a , d ]
    match_EC(d)
    parse_square_brackets(d).each do |x|
      if !match_EC(x, false) and x =~ /:/ then
        # Return value is ignored; the call is made for its side effects
        # (presumably it registers the ID -- helper not shown here).
        parse_ColonSepID(x)
      elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
        # Bracketed text made only of capitals, digits, '_' and '.'
        # is kept as an ID-like string.
        @list_ids << [ $1 ]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # examples:
    # >ABC12345 this is test
    i = $1
    d = $2.to_s
    # A trailing '.' is removed for the ID list only; the stored line
    # and the entry ID keep the token unchanged.
    @list_ids << [ i.chomp('.') ]
    this_line = [  '', [ i ], d ]
    match_EC(d)
  else
    # Fallback: the whole string is used as the ID, with an empty
    # description. @list_ids is not updated in this case.
    i = str
    d = ''
    match_EC(i)
    this_line = [ '', [ i ], d ]
  end

  @deflines << this_line
  # Only the first defline that is added determines the entry ID.
  @entry_id = i unless @entry_id
end
description() click to toggle source

Shows description.

# File lib/bio/db/fasta/defline.rb, line 335
# Returns the description of the first defline (the last element of the
# first record in @deflines), or nil when no defline has been added.
def description
  first_record = @deflines.first
  first_record.to_a[-1]
end
descriptions() click to toggle source

Returns descriptions.

# File lib/bio/db/fasta/defline.rb, line 340
# Returns the descriptions of all deflines, in order, as an array
# (the last element of every record in @deflines).
def descriptions
  @deflines.map { |record| record[-1] }
end
get(dbname) click to toggle source

Returns the identifier for the given database name.

# File lib/bio/db/fasta/defline.rb, line 416
# Returns the identifier stored for the database name +dbname+
# (a String or Symbol, e.g. 'emb' or :gi), or nil when nothing is found.
#
# Lookup order:
# 1. a value already present in @info under that name;
# 2. the first entry of @list_ids whose first element equals the name:
#    * entries with at most two elements yield their last element;
#    * longer entries are resolved through the label list in NSIDs,
#      preferring 'acc_version', 'entry_id', 'locus', 'accession',
#      'number' in that order, then falling back to the first
#      non-empty field.
#
# A value found in @list_ids is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  r = nil
  if di.size <= 2
    r = di[-1]
  else
    # labels is nil when the database name is not registered in NSIDs;
    # in that case only the generic fallback below applies.
    labels = self.class::NSIDs[db]
    if labels
      ['acc_version', 'entry_id', 'locus', 'accession', 'number'].each do |x|
        i = labels.index(x)
        next unless i
        r = di[i + 1]
        break if r
      end
    end
    r ||= di[1..-1].find { |x| x }
  end

  @info[db] = r if r
  r
end
get_all_by_type(*type_strarg) click to toggle source

Returns identifiers by given type.

# File lib/bio/db/fasta/defline.rb, line 452
# Returns every identifier whose type is one of the given type names
# (e.g. 'accession', 'acc_version'), as an array.
#
# For each entry of @list_ids whose database name (first element) is a
# key of NSIDs, the position of each requested type in the label list
# selects the field to take; empty (nil) fields are skipped. Results keep
# the order of @list_ids and, within an entry, the order of the arguments.
def get_all_by_type(*type_strarg)
  collected = []
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels
    type_strarg.each do |wanted|
      pos = labels.index(wanted)
      next unless pos
      value = entry[pos + 1]
      collected << value if value
    end
  end
  collected
end
get_by_type(type_str) click to toggle source

Returns an identifier by given type.

# File lib/bio/db/fasta/defline.rb, line 440
# Returns the first identifier of the given type (e.g. 'gi', 'locus',
# 'acc_version'), or nil when there is none.
#
# Entries of @list_ids are scanned in order. An entry is considered when
# its database name (first element) is a key of NSIDs and its label list
# contains +type_str+. Entries whose field for that type is empty (nil)
# are skipped, so a later entry that does carry a value is still found;
# this matches get_all_by_type, which also ignores empty fields.
def get_by_type(type_str)
  @list_ids.each do |x|
    labels = self.class::NSIDs[x[0]]
    next unless labels
    i = labels.index(type_str)
    next unless i
    value = x[i + 1]
    return value if value
  end
  nil
end
gi() click to toggle source

Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

# File lib/bio/db/fasta/defline.rb, line 481
# Returns the GI (looked up through get_by_type('gi')), or nil when
# there is none. The result, including nil, is memoized in @gi.
def gi
  return @gi if defined?(@gi)
  @gi = get_by_type('gi')
end
id_strings() click to toggle source

Shows ID-like strings. Returns an array of strings.

# File lib/bio/db/fasta/defline.rb, line 348
# Returns ID-like strings as an array.
#
# Two sources are combined, in this order:
# 1. @list_ids: for entries with two or more elements, every non-empty
#    field after the first; for single-element entries, the element itself
#    when it consists only of letters, digits, '.', '-' and '_';
# 2. words(true, []) (case-sensitive, no regexp filtering), keeping only
#    words that look like identifiers: a capital letter followed by
#    characters that include a digit, or capitals/digits joined by '_'.
def id_strings
  ids = []
  @list_ids.each do |fields|
    if fields.size >= 2
      ids.concat(fields[1..-1].select { |value| value })
    elsif fields[0].to_s.size > 0 and fields[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << fields[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
  ids
end
locus() click to toggle source

Shows the locus. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

# File lib/bio/db/fasta/defline.rb, line 470
# Returns the locus (looked up through get_by_type('locus')), or nil
# when there is none. The result, including nil, is memoized in @locus.
def locus
  return @locus if defined?(@locus)
  @locus = get_by_type('locus')
end
method_missing(name, *args) click to toggle source
# File lib/bio/db/fasta/defline.rb, line 521
# Lets database names be used as reader methods, e.g. +obj.emb+ or +obj.gi+.
#
# The call is forwarded to #get. When #get finds nothing, nil is still
# returned for names that are keys of NSIDs; for any other name a
# RuntimeError whose message starts with "NameError:" is raised.
def method_missing(name, *args)
  # raise ArgumentError,
  # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
  value = get(name, *args)
  return value if value || self.class::NSIDs[name.to_s]
  raise "NameError: undefined method `#{name.inspect}'"
end
to_s() click to toggle source

Shows the defline as a string. Note that the result of this method may differ from the original string given to the ::new method.

# File lib/bio/db/fasta/defline.rb, line 327
# Rebuilds a defline string from the parsed records in @deflines.
#
# For every record, the ID parts (all elements between the first and the
# last) are joined with the record's separator (its first element), then
# a space and the description (its last element) are appended and
# surrounding whitespace is stripped. Records are joined with "\x01".
def to_s
  rendered = @deflines.map do |record|
    sep = record[0]
    id_text = record[1..-2].map { |fields| fields.join(sep) }.join(sep)
    (id_text + ' ' + record[-1]).strip
  end
  rendered.join("\x01")
end
words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) click to toggle source

Shows words used in the defline. Returns an Array.

# File lib/bio/db/fasta/defline.rb, line 390
# Returns the words used in the descriptions as a sorted array without
# duplicates.
#
# case_sensitive:: when false/nil (default), words are downcased.
# kill_regexp::    array of patterns; a word matched by any of them is
#                  dropped (default: KillRegexpArray of the class).
# kwhash::         hash keyed by downcased words to drop
#                  (default: KillWordsHash of the class).
#
# All descriptions are joined and split on punctuation, spaces and
# control characters. Leading "$", "*", "-", "+" and trailing "$", "*",
# "-", "=" are removed from each word, and words of one character or
# less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\\`\~\/\|\?\!\&\@\# \x00-\x1f\x7f]+/)
  kept = []
  tokens.each do |token|
    token.sub!(/\A[\$\*\-\+]+/, '')
    token.sub!(/[\$\*\-\=]+\z/, '')
    next if token.size <= 1
    next if kwhash[token.downcase]
    next if kill_regexp.any? { |expr| expr =~ token }
    kept << token
  end
  kept.map! { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end