class Bio::FlatFile::AutoDetect
AutoDetect
Automatically determines the database class of the given data.
Constants
- BottomRule
Special element that is always bottom priority.
- TopRule
Special element that is always top priority.
Public Class Methods
Makes a new autodetect object and adds each of the given elements to it.
# File lib/bio/io/flatfile/autodetection.rb, line 361
#
# Creates a new autodetect object and registers every given element with
# it, in argument order, by calling #add on the new object.
#
# arg:: the elements to add (may be empty)
#
# Returns the newly created autodetect object.
def self.[](*arg)
  arg.each_with_object(self.new) { |elem, detector| detector.add(elem) }
end
Returns the default autodetect object, creating it on first use.
# File lib/bio/io/flatfile/autodetection.rb, line 348
#
# Returns the default autodetect object. While @default is unset (nil or
# false) it is built with make_default and stored, so later calls return
# the stored object.
def self.default
  @default ||= make_default
end
sets the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 356
#
# Replaces the default autodetect object.
#
# ad:: the object to store in @default
def self.default=(ad)
  @default = ad
end
Makes the initial value of the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 368
#
# Builds the autodetect object with the built-in rule set: the rules listed
# below are created and added through self[], priorities between some of
# them are declared with is_prior_to, and finally the object is rehashed
# and returned.
#
# NOTE(review): this listing was recovered from flattened documentation
# text; runs of spaces inside the regexp literals may have been collapsed
# to a single space. Verify the patterns against the upstream file.
def self.make_default
  # Each rule is also bound to a local variable so that it can be referred
  # to in the priority declarations further down.
  a = self[
    genbank = RuleRegexp[ 'Bio::GenBank',
      /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
    genpept = RuleRegexp[ 'Bio::GenPept',
      /^LOCUS .+ aa .+/ ],
    medline = RuleRegexp[ 'Bio::MEDLINE',
      /^PMID\- [0-9]+$/ ],
    embl = RuleRegexp[ 'Bio::EMBL',
      /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
    sptr = RuleRegexp2[ 'Bio::SPTR',
      /^ID .+\; *PRT\;/,
      /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
    prosite = RuleRegexp[ 'Bio::PROSITE',
      /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
    transfac = RuleRegexp[ 'Bio::TRANSFAC',
      /^AC [-A-Za-z0-9_\.]+$/ ],

    # The block picks AAindex2 or AAindex1 from the text once an "H" line
    # is present; it evaluates to false when neither marker is found and
    # to nil when there is no "H" line at all.
    aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
      if /^H [-A-Z0-9_\.]+$/ =~ text then
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end
      else
        nil
      end
    end,

    litdb = RuleRegexp[ 'Bio::LITDB',
      /^CODE [0-9]+$/ ],
    pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
      /^ENTRY .+ Pathway\s+Module\s*/ ],
    pathway = RuleRegexp[ 'Bio::KEGG::PATHWAY',
      /^ENTRY .+ Pathway\s*/ ],
    brite = RuleRegexp[ 'Bio::KEGG::BRITE',
      /^Entry [A-Z0-9]+/ ],
    orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
      /^ENTRY .+ KO\s*/ ],
    drug = RuleRegexp[ 'Bio::KEGG::DRUG',
      /^ENTRY .+ Drug\s*/ ],
    glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN',
      /^ENTRY .+ Glycan\s*/ ],
    enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME',
      /^ENTRY EC [0-9\.]+$/,
      /^ENTRY .+ Enzyme\s*/
    ],
    compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
      /^ENTRY C[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Compound\s*/
    ],
    reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
      /^ENTRY R[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Reaction\s*/
    ],
    genes = RuleRegexp[ 'Bio::KEGG::GENES',
      /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ],
    genome = RuleRegexp[ 'Bio::KEGG::GENOME',
      /^ENTRY [a-z]+$/ ],

    # The DOCTYPE name captured by the regexp ("clusters" or "sequences")
    # selects which MaXML class the block evaluates to.
    fantom =
      RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                   'Bio::FANTOM::MaXML::Sequence') do |text|
      if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
        case $1
        when 'clusters'
          Bio::FANTOM::MaXML::Cluster
        when 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end
      else
        nil
      end
    end,

    pdb = RuleRegexp[ 'Bio::PDB',
      /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
    het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
      /^RESIDUE +.+ +\d+\s*$/ ],

    clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
      /^CLUSTAL .*\(.*\).*sequence +alignment/,
      /^CLUSTAL FORMAT for T-COFFEE/ ],

    gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
      /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

    gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
      /^!!(N|A)A_SEQUENCE .+/ ],

    blastxml = RuleRegexp[ 'Bio::Blast::Report',
      /\<\!DOCTYPE BlastOutput PUBLIC / ],
    wublast = RuleRegexp[ 'Bio::Blast::WU::Report',
      /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    blast = RuleRegexp[ 'Bio::Blast::Default::Report',
      /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
      /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

    blat = RuleRegexp[ 'Bio::Blat::Report',
      /^psLayout version \d+/ ],
    spidey = RuleRegexp[ 'Bio::Spidey::Report',
      /^\-\-SPIDEY version .+\-\-$/ ],
    hmmer = RuleRegexp[ 'Bio::HMMER::Report',
      /^HMMER +\d+\./ ],
    sim4 = RuleRegexp[ 'Bio::Sim4::Report',
      /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

    fastq = RuleRegexp[ 'Bio::Fastq',
      /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+/ ],

    # For text containing a ">" header line, the block chooses among NBRF,
    # FASTA and numeric FASTA; it evaluates to false when no pattern
    # matches and to nil when there is no ">" line.
    fastaformat = RuleProc.new('Bio::FastaFormat',
                               'Bio::NBRF',
                               'Bio::FastaNumericFormat') do |text|
      if /^>.+$/ =~ text
        case text
        when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
          Bio::NBRF
        when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
          Bio::FastaFormat
        when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
          Bio::FastaNumericFormat
        else
          false
        end
      else
        nil
      end
    end
  ]

  # dependencies
  # NCBI
  genbank.is_prior_to genpept
  # EMBL/UniProt
  embl.is_prior_to sptr
  sptr.is_prior_to prosite
  prosite.is_prior_to transfac
  # KEGG
  #aaindex.is_prior_to litdb
  #litdb.is_prior_to brite
  pathway_module.is_prior_to pathway
  pathway.is_prior_to brite
  brite.is_prior_to orthology
  orthology.is_prior_to drug
  drug.is_prior_to glycan
  glycan.is_prior_to enzyme
  enzyme.is_prior_to compound
  compound.is_prior_to reaction
  reaction.is_prior_to genes
  genes.is_prior_to genome
  # PDB
  pdb.is_prior_to het
  # BLAST
  wublast.is_prior_to wutblast
  wutblast.is_prior_to blast
  blast.is_prior_to tblast
  # Fastq
  BottomRule.is_prior_to(fastq)
  fastq.is_prior_to(fastaformat)
  # FastaFormat
  BottomRule.is_prior_to(fastaformat)

  # for debug
  #debug_first = RuleDebug.new('debug_first')
  #a.add(debug_first)
  #debug_first.is_prior_to(TopRule)

  ## for debug
  #debug_last = RuleDebug.new('debug_last')
  #a.add(debug_last)
  #BottomRule.is_prior_to(debug_last)
  #fastaformat.is_prior_to(debug_last)

  ## for suppressing warnings
  # Never executed (guarded by "if false"); it only mentions the locals
  # that are not used in the priority declarations above.
  p medline, aaindex, litdb, fantom, clustal,
    gcg_msf, gcg_seq, blastxml, rpsblast, blat,
    spidey, hmmer, sim4 if false

  a.rehash
  return a
end
Creates a new Autodetect object
# File lib/bio/io/flatfile/autodetection.rb, line 226
#
# Sets up an empty rule table and an empty element cache, then registers
# the two special elements TopRule and BottomRule (in that order).
def initialize
  # name => rule; filled through #add
  @rules = {}
  # cached, priority-ordered element list; nil means "not computed yet"
  @elements = nil
  add(TopRule)
  add(BottomRule)
end
Public Instance Methods
Adds a new element. Returns elem.
# File lib/bio/io/flatfile/autodetection.rb, line 237
#
# Registers +elem+ under its name and drops the cached element list.
#
# elem:: an object responding to #name
#
# Returns +elem+. Raises RuntimeError ('element name conflicts') when an
# element is already registered under the same name.
def add(elem)
  key = elem.name
  raise 'element name conflicts' if @rules[key]

  # the cached ordering no longer reflects the rule table
  @elements = nil
  # Hash#store returns the stored value, i.e. elem
  @rules.store(key, elem)
end
Autodetect from the text. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 305
#
# Asks each element, in the order given by #elements, to guess the
# database class of +text+ and stops at the first truthy answer.
#
# text:: the text to examine
# meta:: extra information passed unchanged to every element's #guess
#
# Returns the first truthy guess. When no element produces one, returns
# the answer of the last element tried (nil or false), or nil when there
# are no elements.
def autodetect(text, meta = {})
  verdict = nil
  elements.each do |rule|
    verdict = rule.guess(text, meta)
    break if verdict
  end
  verdict
end
Autodetects the database class from a FlatFile object. Returns a database class if succeeded. Returns nil if failed.
# File lib/bio/io/flatfile/autodetection.rb, line 318
#
# Autodetects the database class of the data behind the flatfile object
# +ff+, using the stream held in ff's @stream instance variable.
#
# ff::    the flatfile object
# lines:: maximum number of lines to prefetch from the stream (default 31)
#
# Returns the detected class, or nil when nothing was detected.
def autodetect_flatfile(ff, lines = 31)
  stream = ff.instance_eval { @stream }

  # A stream without a usable #path is simply treated as having no path.
  path = begin
    stream.path
  rescue NameError
    nil
  end

  meta = {}
  if path
    meta[:path] = path
    # Try once with the path information before reading anything.
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end

  # Prefetch line by line, retrying on the accumulated buffer after every
  # non-blank line.
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    next if line.strip.empty?

    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  nil
end
Iterates over each element.
# File lib/bio/io/flatfile/autodetection.rb, line 298
#
# Passes the given block to #each on the list returned by #elements, so
# the block is called once per element in that list's order.
def each_rule(&x) #:yields: elem
  elements.each(&x)
end
Returns current elements as an array whose order fulfills all elements’ priorities.
# File lib/bio/io/flatfile/autodetection.rb, line 275
#
# Returns the current elements as an array. The array is the result of
# tsort, reversed in place, and is cached in @elements until the cache is
# cleared.
def elements
  @elements ||= tsort.reverse!
end
Visualizes the object (mainly for debugging).
# File lib/bio/io/flatfile/autodetection.rb, line 291
#
# Returns a string of the form "<ClassName "name1" "name2" ...>" listing
# the inspected names of all elements in #elements order.
def inspect
  names = elements.map { |e| e.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end
Rebuilds the object and clears the internal cache.
# File lib/bio/io/flatfile/autodetection.rb, line 285
#
# Rehashes the rule table and then discards the cached element list.
# Returns nil.
def rehash
  @rules.rehash
  # forces #elements to recompute the ordering on next use
  @elements = nil
end
(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.
# File lib/bio/io/flatfile/autodetection.rb, line 253
#
# (required by TSort.) Yields each child of +elem+:
#
# * for TopRule: every registered rule except TopRule itself and rules
#   that list TopRule among their lower priority elements;
# * for BottomRule: every registered rule that lists BottomRule among its
#   higher priority elements;
# * for any other element: its lower priority elements other than
#   BottomRule, followed by BottomRule unless the element lists BottomRule
#   among its higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule || rule.lower_priority_elements.index(TopRule)

      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |lower|
      yield lower if lower != BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end
(required by TSort.) For all elements, yields each element.
# File lib/bio/io/flatfile/autodetection.rb, line 246
#
# (required by TSort.) Passes the given block to each_value on the rule
# table, so it is called once for every registered element.
def tsort_each_node(&x)
  @rules.each_value(&x)
end