class Bio::FlatFile::AutoDetect

AutoDetect automatically determines the database class of the given data.

Constants

BottomRule

Special element that is always bottom priority.

TopRule

Special element that is always top priority.

Public Class Methods

[](*arg) click to toggle source

Makes a new autodetect object and adds each of the given elements to it.

# File lib/bio/io/flatfile/autodetection.rb, line 361
# Creates a fresh autodetect object, registers every given element with it
# (in argument order, via #add), and returns the new object.
def self.[](*arg)
  arg.each_with_object(self.new) { |elem, detector| detector.add(elem) }
end
default() click to toggle source

Returns the default autodetect object, creating it with make_default on first use.

# File lib/bio/io/flatfile/autodetection.rb, line 348
# Returns the shared default autodetect object. It is built lazily with
# make_default the first time it is requested (or whenever the stored value
# is nil/false) and then reused.
def self.default
  @default ||= self.make_default
end
default=(ad) click to toggle source

sets the default autodetect object.

# File lib/bio/io/flatfile/autodetection.rb, line 356
# Stores +ad+ as the default autodetect object, replacing any previously
# stored value. Assigning nil (or false) causes the next call to
# self.default to build a new object with make_default.
def self.default=(ad)
  @default = ad
end
make_default() click to toggle source

Builds the autodetect object that default() uses when no default has been set.

# File lib/bio/io/flatfile/autodetection.rb, line 368
# Builds and returns a new autodetect object holding the built-in rule set.
#
# Every rule is created inline inside the self[...] call, which adds it to
# the newly created object, and is at the same time assigned to a local
# variable so that the priority declarations further down can refer to it.
# Once the priorities have been declared with is_prior_to, rehash is called
# on the object and the object is returned.
#
# NOTE(review): the semantics of RuleRegexp, RuleRegexp2 and RuleProc are not
# visible in this method; the comments below describe only what is written
# here.
def self.make_default
  a = self[
    genbank  = RuleRegexp[ 'Bio::GenBank',
      /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
    genpept  = RuleRegexp[ 'Bio::GenPept',
      /^LOCUS       .+ aa .+/ ],
    medline  = RuleRegexp[ 'Bio::MEDLINE',
      /^PMID\- [0-9]+$/ ],
    embl     = RuleRegexp[ 'Bio::EMBL',
      /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
    sptr     = RuleRegexp2[ 'Bio::SPTR',
      /^ID   .+\; *PRT\;/,
      /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
    prosite  = RuleRegexp[ 'Bio::PROSITE',
      /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
    transfac = RuleRegexp[ 'Bio::TRANSFAC',
      /^AC  [-A-Za-z0-9_\.]+$/ ],

    # The block returns a class when the text identifies one of the two
    # AAindex formats, false when the "H" line matched but neither format
    # marker was found, and nil when the "H" line did not match at all.
    # NOTE(review): how RuleProc distinguishes false from nil is defined
    # elsewhere -- confirm before changing either value.
    aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
      if /^H [-A-Z0-9_\.]+$/ =~ text then
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I    A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end
      else
        nil
      end
    end,

    litdb    = RuleRegexp[ 'Bio::LITDB',
      /^CODE        [0-9]+$/ ],
    pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
      /^ENTRY       .+ Pathway\s+Module\s*/ ],
    pathway  = RuleRegexp[ 'Bio::KEGG::PATHWAY',
      /^ENTRY       .+ Pathway\s*/ ],
    brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
      /^Entry           [A-Z0-9]+/ ],
    orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
      /^ENTRY       .+ KO\s*/ ],
    drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
      /^ENTRY       .+ Drug\s*/ ],
    glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
      /^ENTRY       .+ Glycan\s*/ ],
    enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
      /^ENTRY       EC [0-9\.]+$/,
      /^ENTRY       .+ Enzyme\s*/
    ],
    compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
      /^ENTRY       C[A-Za-z0-9\._]+$/,
      /^ENTRY       .+ Compound\s*/
    ],
    reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
      /^ENTRY       R[A-Za-z0-9\._]+$/,
      /^ENTRY       .+ Reaction\s*/
    ],
    genes    = RuleRegexp[ 'Bio::KEGG::GENES',
      /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
    genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
      /^ENTRY       [a-z]+$/ ],

    # Chooses between the two MaXML classes from the document type named in
    # the DOCTYPE declaration; returns nil when no such declaration is found.
    fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                          'Bio::FANTOM::MaXML::Sequence') do |text|
      if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
        case $1
        when 'clusters'
          Bio::FANTOM::MaXML::Cluster
        when 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end
      else
        nil
      end
    end,

    pdb = RuleRegexp[ 'Bio::PDB',
      /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
    het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
      /^RESIDUE +.+ +\d+\s*$/ ],

    clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
    /^CLUSTAL .*\(.*\).*sequence +alignment/,
    /^CLUSTAL FORMAT for T-COFFEE/ ],

    gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
    /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

    gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
    /^!!(N|A)A_SEQUENCE .+/ ],

    blastxml = RuleRegexp[ 'Bio::Blast::Report',
      /\<\!DOCTYPE BlastOutput PUBLIC / ],
    wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
      /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
      /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    rpsblast   = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
      /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

    blat   = RuleRegexp[ 'Bio::Blat::Report',
      /^psLayout version \d+/ ],
    spidey = RuleRegexp[ 'Bio::Spidey::Report',
      /^\-\-SPIDEY version .+\-\-$/ ],
    hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
      /^HMMER +\d+\./ ],
    sim4   = RuleRegexp[ 'Bio::Sim4::Report',
      /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

    fastq  = RuleRegexp[ 'Bio::Fastq',
      /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+/ ],

    # Only texts containing a ">" header line are considered. The three
    # patterns are tried in the order written (NBRF, then FASTA, then numeric
    # FASTA); false is returned when a header exists but none of them match.
    fastaformat = RuleProc.new('Bio::FastaFormat',
                               'Bio::NBRF',
                               'Bio::FastaNumericFormat') do |text|
      if /^>.+$/ =~ text
        case text
        when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
          Bio::NBRF
        when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
            Bio::FastaFormat
        when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
          Bio::FastaNumericFormat
        else
          false
        end
      else
        nil
      end
    end
  ]

  # dependencies
  # Each "x.is_prior_to y" line below declares a priority relation between two
  # of the rules created above.
  # NCBI
  genbank.is_prior_to genpept
  # EMBL/UniProt
  embl.is_prior_to sptr
  sptr.is_prior_to prosite
  prosite.is_prior_to transfac
  # KEGG
  #aaindex.is_prior_to litdb
  #litdb.is_prior_to brite
  pathway_module.is_prior_to pathway
  pathway.is_prior_to brite
  brite.is_prior_to orthology
  orthology.is_prior_to drug
  drug.is_prior_to glycan
  glycan.is_prior_to enzyme
  enzyme.is_prior_to compound
  compound.is_prior_to reaction
  reaction.is_prior_to genes
  genes.is_prior_to genome
  # PDB
  pdb.is_prior_to het
  # BLAST
  wublast.is_prior_to wutblast
  wutblast.is_prior_to blast
  blast.is_prior_to tblast
  # Fastq
  # fastq and fastaformat are both placed after BottomRule, with fastq ahead
  # of fastaformat.
  BottomRule.is_prior_to(fastq)
  fastq.is_prior_to(fastaformat)
  # FastaFormat
  BottomRule.is_prior_to(fastaformat)

  # for debug
  #debug_first = RuleDebug.new('debug_first')
  #a.add(debug_first)
  #debug_first.is_prior_to(TopRule)

  ## for debug
  #debug_last = RuleDebug.new('debug_last')
  #a.add(debug_last)
  #BottomRule.is_prior_to(debug_last)
  #fastaformat.is_prior_to(debug_last)

  # Drop the cached element order so it is recomputed with the priorities
  # declared above.
  a.rehash
  return a
end
new() click to toggle source

Creates a new Autodetect object

# File lib/bio/io/flatfile/autodetection.rb, line 226
# Sets up an empty rule table and an empty ordering cache, then registers
# the two special elements TopRule and BottomRule (in that order).
def initialize
  # autodetection rules, keyed by rule name
  @rules = {}
  # cached, priority-ordered element list (nil means "not computed yet")
  @elements = nil
  [TopRule, BottomRule].each { |rule| add(rule) }
end

Public Instance Methods

add(elem) click to toggle source

Adds a new element. Returns elem.

# File lib/bio/io/flatfile/autodetection.rb, line 237
# Registers +elem+ under its name and invalidates the cached element order.
# Raises RuntimeError ('element name conflicts') when a rule is already
# stored under the same name. Returns +elem+.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]
  # the ordering cache is stale as soon as the rule set changes
  @elements = nil
  @rules.store(key, elem)
  elem
end
autodetect(text, meta = {}) click to toggle source

Autodetects the database class from the text. Returns a database class on success, or nil on failure.

# File lib/bio/io/flatfile/autodetection.rb, line 305
# Asks each element, in priority order, to guess the database class of
# +text+ (passing +meta+ along) and stops at the first truthy answer.
# Returns that answer; when no element gives a truthy answer, the value
# returned by the last element asked is returned (nil when there are no
# elements at all).
def autodetect(text, meta = {})
  guessed = nil
  # find stops iterating as soon as the block value is truthy; the
  # assignment keeps the most recent guess available afterwards.
  elements.find { |rule| guessed = rule.guess(text, meta) }
  guessed
end
autodetect_flatfile(ff, lines = 31) click to toggle source

Autodetects the database class from the FlatFile object. Returns a database class on success, or nil on failure.

# File lib/bio/io/flatfile/autodetection.rb, line 318
# Detects the database class of the data behind the FlatFile object +ff+.
#
# The stream is taken from ff's @stream instance variable. When the stream
# reports a path, the path is stored in the meta hash and detection is tried
# once on the current prefetch buffer before anything is read. After that, up
# to +lines+ lines are prefetched; following each non-blank line, detection
# is retried on the whole prefetch buffer.
#
# Returns the first truthy result of #autodetect, or nil when the line limit
# or the end of the stream is reached without one.
def autodetect_flatfile(ff, lines = 31)
  stream = ff.instance_eval { @stream }
  # Streams without a usable #path raise NameError (which also covers
  # NoMethodError); they are treated as having no path.
  path = begin
    stream.path
  rescue NameError
    nil
  end

  meta = {}
  if path
    meta[:path] = path
    # call autodetect once with meta and without any read action
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end

  # reading stream
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    # blank lines add no information, so detection is not retried for them
    next if line.strip.empty?
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  nil
end
each_rule() { |elem| ... } click to toggle source

Iterates over each element.

# File lib/bio/io/flatfile/autodetection.rb, line 298
# Passes the given block straight to #each on the priority-ordered element
# list returned by #elements, so the block receives one element per call.
def each_rule(&x) #:yields: elem
  elements.each(&x)
end
elements() click to toggle source

Returns current elements as an array whose order fulfills all elements' priorities.

# File lib/bio/io/flatfile/autodetection.rb, line 275
# Returns the elements as an array ordered so that every declared priority
# is respected. The order is the reverse of what tsort produces; it is
# computed on first use and cached in @elements until the cache is cleared.
def elements
  @elements ||= tsort.reverse
end
inspect() click to toggle source

visualizes the object (mainly for debug)

# File lib/bio/io/flatfile/autodetection.rb, line 291
# Returns a debugging string of the form "<ClassName name1 name2 ...>",
# listing the inspected name of each element in priority order.
def inspect
  names = elements.map { |rule| rule.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end
rehash() click to toggle source

Rebuilds the internal rule table and clears the cached element order.

# File lib/bio/io/flatfile/autodetection.rb, line 285
# Rehashes the internal rule table and discards the cached element order,
# so the next call to #elements recomputes it. Returns nil.
def rehash
  @rules.rehash
  @elements = nil
end
tsort_each_child(elem) { |e| ... } click to toggle source

(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.

# File lib/bio/io/flatfile/autodetection.rb, line 253
# (required by TSort.) Yields each child (= lower priority element) of
# +elem+.
#
# * For TopRule: every registered rule except TopRule itself and rules that
#   list TopRule among their lower priority elements.
# * For BottomRule: every registered rule that lists BottomRule among its
#   higher priority elements.
# * For any other element: its own lower priority elements (BottomRule
#   excluded), followed by BottomRule unless the element lists BottomRule
#   among its higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule || rule.lower_priority_elements.index(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |rule|
      yield rule unless rule == BottomRule
    end
    # ordinary elements implicitly rank above BottomRule unless they were
    # explicitly placed below it
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end
tsort_each_node(&x) click to toggle source

(required by TSort.) For all elements, yields each element.

# File lib/bio/io/flatfile/autodetection.rb, line 246
# (required by TSort.) Passes the given block to each_value on the rule
# table, so the block receives every registered element once.
def tsort_each_node(&x)
  @rules.each_value(&x)
end