class Bio::FlatFile::AutoDetect

AutoDetect automatically determines the database class of the given data.

Constants

BottomRule

Special element that is always bottom priority.

TopRule

Special element that is always top priority.

Public Class Methods

[](*arg) click to toggle source

Makes a new autodetect object and adds each of the given elements to it.

    # File lib/bio/io/flatfile/autodetection.rb
# Builds a fresh autodetect object, adds every given element to it in
# argument order, and returns the new object.
def self.[](*arg)
  arg.each_with_object(new) { |rule, detector| detector.add(rule) }
end
default() click to toggle source

Returns the default autodetect object, creating it with make_default on first use.

    # File lib/bio/io/flatfile/autodetection.rb
# Returns the default autodetect object.
# It is built with make_default the first time it is needed and then
# kept in @default for later calls.
def self.default
  @default ||= make_default
end
default=(ad) click to toggle source

Sets the default autodetect object.

    # File lib/bio/io/flatfile/autodetection.rb
# Replaces the default autodetect object with +ad+.
# The value is stored as given; no validation is performed here.
def self.default=(ad)
  @default = ad
end
make_default() click to toggle source

Makes a new autodetect object containing the default set of rules (used to build the default autodetect object).

    # File lib/bio/io/flatfile/autodetection.rb
# Builds and returns a new autodetect object filled with the built-in rules.
#
# Every rule is created with RuleRegexp[], RuleRegexp2[] or RuleProc.new,
# given one or more database class names (as strings) plus the regexps or
# block used for detection, and is passed to self[] so that it is added to
# the new object. Each rule is also kept in a local variable so that the
# ordering constraints below can be declared with is_prior_to
# (presumably "receiver is tried before the argument" -- confirm against
# the rule class, which is not shown here).
#
# The object is rehashed before being returned.
368 def self.make_default
369   a = self[
370     genbank  = RuleRegexp[ 'Bio::GenBank',
371       /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
372     genpept  = RuleRegexp[ 'Bio::GenPept',
373       /^LOCUS       .+ aa .+/ ],
374     medline  = RuleRegexp[ 'Bio::MEDLINE',
375       /^PMID\- [0-9]+$/ ],
376     embl     = RuleRegexp[ 'Bio::EMBL',
377       /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
378     sptr     = RuleRegexp2[ 'Bio::SPTR',
379       /^ID   .+\; *PRT\;/,
380       /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
381     prosite  = RuleRegexp[ 'Bio::PROSITE',
382       /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
383     transfac = RuleRegexp[ 'Bio::TRANSFAC',
384       /^AC  [-A-Za-z0-9_\.]+$/ ],
385
386     aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
387       if /^H [-A-Z0-9_\.]+$/ =~ text then
388         if text =~ /^M [rc]/ then
389           Bio::AAindex2
390         elsif text =~ /^I    A\/L/ then
391           Bio::AAindex1
392         else
393           false #fail to determine
394         end
395       else
396         nil # no "H " line in the text
397       end
398     end,
399
400     litdb    = RuleRegexp[ 'Bio::LITDB',
401       /^CODE        [0-9]+$/ ],
402     pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
403       /^ENTRY       .+ Pathway\s+Module\s*/ ],
404     pathway  = RuleRegexp[ 'Bio::KEGG::PATHWAY',
405       /^ENTRY       .+ Pathway\s*/ ],
406     brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
407       /^Entry           [A-Z0-9]+/ ],
408     orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
409       /^ENTRY       .+ KO\s*/ ],
410     drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
411       /^ENTRY       .+ Drug\s*/ ],
412     glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
413       /^ENTRY       .+ Glycan\s*/ ],
414     enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
415       /^ENTRY       EC [0-9\.]+$/,
416       /^ENTRY       .+ Enzyme\s*/
417     ],
418     compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
419       /^ENTRY       C[A-Za-z0-9\._]+$/,
420       /^ENTRY       .+ Compound\s*/
421     ],
422     reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
423       /^ENTRY       R[A-Za-z0-9\._]+$/,
424       /^ENTRY       .+ Reaction\s*/
425     ],
426     genes    = RuleRegexp[ 'Bio::KEGG::GENES',
427       /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
428     genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
429       /^ENTRY       [a-z]+$/ ],
430
431     fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
432                           'Bio::FANTOM::MaXML::Sequence') do |text|
433       if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
434         case $1
435         when 'clusters'
436           Bio::FANTOM::MaXML::Cluster
437         when 'sequences'
438           Bio::FANTOM::MaXML::Sequence
439         else
440           nil #unknown
441         end
442       else
443         nil # no maxml DOCTYPE declaration in the text
444       end
445     end,
446
447     pdb = RuleRegexp[ 'Bio::PDB',
448       /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
449     het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
450       /^RESIDUE +.+ +\d+\s*$/ ],
451
452     clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
453     /^CLUSTAL .*\(.*\).*sequence +alignment/,
454     /^CLUSTAL FORMAT for T-COFFEE/ ],
455
456     gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
457     /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],
458
459     gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
460     /^!!(N|A)A_SEQUENCE .+/ ],
461
462     blastxml = RuleRegexp[ 'Bio::Blast::Report',
463       /\<\!DOCTYPE BlastOutput PUBLIC / ],
464     wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
465       /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
466     wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
467       /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
468     blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
469       /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
470     tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
471       /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
472     rpsblast   = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
473       /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
474
475     blat   = RuleRegexp[ 'Bio::Blat::Report',
476       /^psLayout version \d+/ ],
477     spidey = RuleRegexp[ 'Bio::Spidey::Report',
478       /^\-\-SPIDEY version .+\-\-$/ ],
479     hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
480       /^HMMER +\d+\./ ],
481     sim4   = RuleRegexp[ 'Bio::Sim4::Report',
482       /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
483
484     fastq  = RuleRegexp[ 'Bio::Fastq',
485       /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+/ ],
486
487     fastaformat = RuleProc.new('Bio::FastaFormat',
488                                'Bio::NBRF',
489                                'Bio::FastaNumericFormat') do |text|
490       if /^>.+$/ =~ text
491         case text
492         when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
493           Bio::NBRF
494         when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
495             Bio::FastaFormat
496         when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
497           Bio::FastaNumericFormat
498         else
499           false # has a ">" line but none of the three patterns matched
500         end
501       else
502         nil # no ">" line in the text
503       end
504     end
505   ]
506
507   # dependencies (ordering constraints between the rules created above)
508   # NCBI
509   genbank.is_prior_to genpept
510   # EMBL/UniProt
511   embl.is_prior_to sptr
512   sptr.is_prior_to prosite
513   prosite.is_prior_to transfac
514   # KEGG
515   #aaindex.is_prior_to litdb
516   #litdb.is_prior_to brite
517   pathway_module.is_prior_to pathway
518   pathway.is_prior_to brite
519   brite.is_prior_to orthology
520   orthology.is_prior_to drug
521   drug.is_prior_to glycan
522   glycan.is_prior_to enzyme
523   enzyme.is_prior_to compound
524   compound.is_prior_to reaction
525   reaction.is_prior_to genes
526   genes.is_prior_to genome
527   # PDB
528   pdb.is_prior_to het
529   # BLAST
530   wublast.is_prior_to wutblast
531   wutblast.is_prior_to blast
532   blast.is_prior_to tblast
533   # Fastq
534   BottomRule.is_prior_to(fastq)
535   fastq.is_prior_to(fastaformat)
536   # FastaFormat
537   BottomRule.is_prior_to(fastaformat)
538
539   # for debug
540   #debug_first = RuleDebug.new('debug_first')
541   #a.add(debug_first)
542   #debug_first.is_prior_to(TopRule)
543
544   ## for debug
545   #debug_last = RuleDebug.new('debug_last')
546   #a.add(debug_last)
547   #BottomRule.is_prior_to(debug_last)
548   #fastaformat.is_prior_to(debug_last)
549
550   ## for suppressing warnings: mentions the rule variables that are not
        ## used in the ordering section; never executed because of `if false`
551   p medline, aaindex, litdb, fantom, clustal,
552     gcg_msf, gcg_seq, blastxml, rpsblast, blat,
553     spidey, hmmer, sim4 if false
554
555   a.rehash
556   return a
557 end
new() click to toggle source

Creates a new Autodetect object

    # File lib/bio/io/flatfile/autodetection.rb
# Sets up an empty autodetect object and registers the two special
# elements, TopRule and BottomRule, in that order.
def initialize
  # table of autodetection rules, keyed by element name
  @rules = {}
  # cached ordered list of elements; nil until it is (re)built
  @elements = nil
  add(TopRule)
  add(BottomRule)
end

Public Instance Methods

add(elem) click to toggle source

Adds a new element. Returns elem.

    # File lib/bio/io/flatfile/autodetection.rb
# Registers +elem+ under its name and returns +elem+.
# Raises RuntimeError ('element name conflicts') when an element is
# already stored under the same name.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]

  # the cached element order is stale once the rule set changes
  @elements = nil
  @rules[key] = elem
  elem
end
autodetect(text, meta = {}) click to toggle source

Autodetects from the text. Returns a database class on success, or nil on failure.

    # File lib/bio/io/flatfile/autodetection.rb
# Asks each element, in the order given by #elements, to guess the
# database class for +text+ (+meta+ is passed through to every guess).
# Returns the first truthy guess. When no element produces one, the
# value of the last guess is returned (nil if there are no elements).
def autodetect(text, meta = {})
  guess = nil
  elements.each do |rule|
    guess = rule.guess(text, meta)
    return guess if guess
  end
  guess
end
autodetect_flatfile(ff, lines = 31) click to toggle source

Autodetects from the FlatFile object. Returns a database class on success, or nil on failure.

    # File lib/bio/io/flatfile/autodetection.rb
# Detects the database class for the FlatFile object +ff+.
#
# The stream is taken from ff's @stream instance variable. If the stream
# answers to +path+, the path is stored in meta[:path] and detection is
# tried once on the current prefetch buffer before anything is read.
# After that, up to +lines+ lines are read with prefetch_gets; after each
# non-blank line, detection is retried on the prefetch buffer.
#
# Returns the detected class, or nil if nothing was detected.
def autodetect_flatfile(ff, lines = 31)
  stream = ff.instance_eval { @stream }
  meta = {}
  path = begin
    stream.path
  rescue NameError
    # the stream has no usable #path; continue without a path hint
    nil
  end
  if path
    meta[:path] = path
    # try once with the path hint, without any read action
    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  # read the stream line by line
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    next if line.strip.empty?

    detected = autodetect(stream.prefetch_buffer, meta)
    return detected if detected
  end
  nil
end
each_rule() { |elem| ... } click to toggle source

Iterates over each element.

    # File lib/bio/io/flatfile/autodetection.rb
# Passes each element, in the order given by #elements, to the block.
# The return value is whatever elements.each returns for that block.
def each_rule(&block) #:yields: elem
  elements.each(&block)
end
elements() click to toggle source

Returns current elements as an array whose order fulfills all elements' priorities.

    # File lib/bio/io/flatfile/autodetection.rb
# Returns the elements as an array: the result of tsort, reversed.
# The array is cached in @elements and reused until the cache is reset
# to nil.
def elements
  @elements ||= tsort.reverse!
end
inspect() click to toggle source

Returns a string representation of the object, listing its element names (mainly for debugging).

    # File lib/bio/io/flatfile/autodetection.rb
# Returns a string of the form "<ClassName name1 name2 ...>", where each
# name is the inspected name of an element, in #elements order.
def inspect
  names = elements.map { |rule| rule.name.inspect }.join(' ')
  "<#{self.class} #{names}>"
end
rehash() click to toggle source

rebuilds the object and clears internal cache.

    # File lib/bio/io/flatfile/autodetection.rb
# Rehashes the rule table and discards the cached element order, so the
# order is recomputed the next time it is needed. Returns nil.
def rehash
  @rules.rehash
  @elements = nil
end
tsort_each_child(elem) { |e| ... } click to toggle source

(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.

    # File lib/bio/io/flatfile/autodetection.rb
# (required by TSort.) Yields each child (= lower priority element) of
# +elem+.
#
# * TopRule: every registered element except TopRule itself and elements
#   that list TopRule among their lower priority elements.
# * BottomRule: every registered element that lists BottomRule among its
#   higher priority elements.
# * any other element: its lower priority elements except BottomRule,
#   then BottomRule itself unless the element lists BottomRule among its
#   higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule || rule.lower_priority_elements.index(TopRule)

      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |rule|
      yield rule if rule != BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end
tsort_each_node(&x) click to toggle source

(required by TSort.) For all elements, yields each element.

    # File lib/bio/io/flatfile/autodetection.rb
# (required by TSort.) Passes every registered element (each value of
# the rule table) to the block.
def tsort_each_node(&block)
  @rules.each_value(&block)
end