class Bio::REBASE
bio/db/rebase.rb - Interface for EMBOSS
formatted REBASE
files
- Author
-
Trevor Wennblom <trevor@corevx.com>
- Copyright
-
Copyright © 2005-2007 Midwinter Laboratories, LLC (midwinterlabs.com)
- License
-
The Ruby License
Description¶ ↑
Bio::REBASE
provides utilities for interacting with REBASE
data in EMBOSS
format. REBASE
is the Restriction Enzyme Database; more information can be found at http://rebase.neb.com
EMBOSS
formatted files located at:
These files are the same as the “emboss_?.???” files located at: ftp://ftp.neb.com/pub/rebase/
To easily get started with the data you can simply type this command at your shell prompt:
% wget "ftp://ftp.neb.com/pub/rebase/emboss_*"
Usage¶ ↑
require 'bio'
require 'pp'

enz = File.read('emboss_e')
ref = File.read('emboss_r')
sup = File.read('emboss_s')

# When creating a new instance of Bio::REBASE
# the contents of the enzyme file must be passed.
# The references and suppliers file contents
# may also be passed.
rebase = Bio::REBASE.new( enz )
rebase = Bio::REBASE.new( enz, ref )
rebase = Bio::REBASE.new( enz, ref, sup )

# The 'read' class method allows you to read in files
# that are REBASE EMBOSS formatted
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )

# The data loaded may be saved in YAML format
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

# YAML formatted files can also be read with the
# class method 'load_yaml'
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

pp rebase.enzymes[0..4]                     # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
pp rebase.enzyme_name?('aasi')              # true
pp rebase['AarI'].pattern                   # "CACCTGC"
pp rebase['AarI'].blunt?                    # false
pp rebase['AarI'].organism                  # "Arthrobacter aurescens SS2-322"
pp rebase['AarI'].source                    # "A. Janulaitis"
pp rebase['AarI'].primary_strand_cut1       # 11
pp rebase['AarI'].primary_strand_cut2       # 0
pp rebase['AarI'].complementary_strand_cut1 # 15
pp rebase['AarI'].complementary_strand_cut2 # 0
pp rebase['AarI'].suppliers                 # ["F"]
pp rebase['AarI'].supplier_names            # ["Fermentas International Inc."]

pp rebase['AarI'].isoschizomers             # Currently none stored in the references file
pp rebase['AarI'].methylation               # ""

pp rebase['EcoRII'].methylation             # "2(5)"
pp rebase['EcoRII'].suppliers               # ["F", "J", "M", "O", "S"]
pp rebase['EcoRII'].supplier_names          # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
                                            #  "Roche Applied Science", "Toyobo Biochemicals",
                                            #  "Sigma Chemical Corporation"]

# Number of enzymes in the database
pp rebase.size                              # 673
pp rebase.enzymes.size                      # 673

rebase.each do |name, info|
  pp "#{name}: #{info.methylation}" unless info.methylation.empty?
end
Public Class Methods
Read YAML formatted files
rebase = Bio::REBASE.load_yaml( 'enz.yaml' ) rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' ) rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
-
f_enzyme
: (required) Filename to read YAML-formatted enzyme data -
f_reference
: (optional) Filename to read YAML-formatted reference data -
f_supplier
: (optional) Filename to read YAML-formatted supplier data
- Returns
-
Bio::REBASE
object
# File lib/bio/db/rebase.rb, line 258
#
# Build a Bio::REBASE object from YAML files (for example the ones
# written by #save_yaml).
#
# Arguments
# * +f_enzyme+: (required) filename of the YAML-formatted enzyme data
# * +f_reference+: (optional) filename of the YAML-formatted reference data
# * +f_supplier+: (optional) filename of the YAML-formatted supplier data
# Returns:: the value of +new+, called with the loaded data and the
#           yaml flag set to +true+
#
# NOTE(review): YAML.load_file deserialises whatever the file contains;
# only point this at files you trust.
def self.load_yaml(f_enzyme, f_reference = nil, f_supplier = nil)
  enzymes = YAML.load_file(f_enzyme)
  # Optional files that were not given stay nil.
  references, suppliers = [f_reference, f_supplier].map do |path|
    YAML.load_file(path) if path
  end
  new(enzymes, references, suppliers, true)
end
Constructor
Arguments
-
enzyme_lines
: (required) contents of EMBOSS
formatted enzymes file -
reference_lines
: (optional) contents of EMBOSS
formatted references file -
supplier_lines
: (optional) contents of EMBOSS
formatted suppliers files -
yaml
: (optional, default false
) enzyme_lines, reference_lines, and supplier_lines are read as YAML if set to true
- Returns
# File lib/bio/db/rebase.rb, line 174
#
# Constructor.
#
# Arguments
# * +enzyme_lines+: (required) contents of the EMBOSS formatted enzymes file
# * +reference_lines+: (optional) contents of the EMBOSS formatted references file
# * +supplier_lines+: (optional) contents of the EMBOSS formatted suppliers file
# * +yaml+: (optional, default +false+) when true the three arguments are
#   stored as given instead of being run through the parse_* methods
def initialize(enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false)
  # All your REBASE are belong to us.
  @enzyme_data, @reference_data, @supplier_data =
    if yaml
      [enzyme_lines, reference_lines, supplier_lines]
    else
      [
        parse_enzymes(enzyme_lines),
        parse_references(reference_lines),
        parse_suppliers(supplier_lines)
      ]
    end

  # Hand the supplier data to EnzymeEntry before the entries are built.
  EnzymeEntry.supplier_data = @supplier_data
  setup_enzyme_data
end
Read REBASE
EMBOSS-formatted files
rebase = Bio::REBASE.read( 'emboss_e' ) rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' ) rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
Arguments
-
f_enzyme
: (required) Filename to read enzyme data -
f_reference
: (optional) Filename to read reference data -
f_supplier
: (optional) Filename to read supplier data
- Returns
-
Bio::REBASE
object
# File lib/bio/db/rebase.rb, line 240
#
# Read REBASE EMBOSS-formatted files.
#
#   rebase = Bio::REBASE.read( 'emboss_e' )
#   rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
#   rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
#
# Arguments
# * +f_enzyme+: (required) filename to read enzyme data from
# * +f_reference+: (optional) filename to read reference data from
# * +f_supplier+: (optional) filename to read supplier data from
# Returns:: the value of +new+, called with the file contents
def self.read(f_enzyme, f_reference = nil, f_supplier = nil)
  # Pass each file's contents as one String: the parse_* methods iterate
  # with each_line, which the Array returned by IO.readlines does not
  # respond to.
  e = File.read(f_enzyme)
  r = f_reference ? File.read(f_reference) : nil
  s = f_supplier ? File.read(f_supplier) : nil
  new(e, r, s)
end
Public Instance Methods
Calls block once for each element in @data
hash, passing that element as a parameter.
Arguments
-
Accepts a block
- Returns
-
results of block operations
# File lib/bio/db/rebase.rb, line 150
#
# Calls the block once for each element of the @data hash, passing the
# element (a [name, entry] pair) as the parameter.
#
# Arguments
# * Accepts a block; without one an Enumerator over the same elements
#   is returned instead of raising LocalJumpError.
# Returns:: whatever @data.each returns when a block is given,
#           otherwise an Enumerator
def each
  return enum_for(:each) unless block_given?

  @data.each { |item| yield item }
end
Check if supplied name is the name of an available enzyme
Arguments
-
name
: Enzyme name
- Returns
-
true/false
# File lib/bio/db/rebase.rb, line 207
#
# Check whether +name+ is the name of an available enzyme. The
# comparison is done on the downcased name.
#
# Arguments
# * +name+: enzyme name
# Returns:: +true+ or +false+
def enzyme_name?(name)
  candidate = name.downcase
  @enzyme_names_downcased.include?(candidate)
end
List the enzymes available
Arguments
-
none
- Returns
-
Array
sorted enzyme names
# File lib/bio/db/rebase.rb, line 197
#
# List the enzymes available.
#
# Arguments
# * _none_
# Returns:: the @enzyme_names array (built from the sorted keys of the
#           enzyme data in setup_enzyme_data)
def enzymes
  @enzyme_names
end
Save the current data
rebase.save_yaml( 'enz.yaml' ) rebase.save_yaml( 'enz.yaml', 'ref.yaml' ) rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
-
f_enzyme
: (required) Filename to save YAML formatted output of enzyme data -
f_reference
: (optional) Filename to save YAML formatted output of reference data -
f_supplier
: (optional) Filename to save YAML formatted output of supplier data
- Returns
-
nothing
# File lib/bio/db/rebase.rb, line 222
#
# Save the current data as YAML.
#
#   rebase.save_yaml( 'enz.yaml' )
#   rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
#   rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
#
# Arguments
# * +f_enzyme+: (required) filename for the YAML dump of the enzyme data
# * +f_reference+: (optional) filename for the YAML dump of the reference data
# * +f_supplier+: (optional) filename for the YAML dump of the supplier data
# Returns:: +nil+
def save_yaml(f_enzyme, f_reference = nil, f_supplier = nil)
  targets = [
    [f_enzyme, @enzyme_data],
    [f_reference, @reference_data],
    [f_supplier, @supplier_data]
  ]

  targets.each_with_index do |(path, payload), position|
    # The enzyme file is always written; the other two only when named.
    next if position > 0 && !path

    File.open(path, 'w') { |io| io.puts YAML.dump(payload) }
  end

  nil
end
Protected Instance Methods
data is a hash indexed by the :name of each entry which is also a hash
-
data has the following keys: :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4, where :c1 => First 5’ cut, :c2 => First 3’ cut, :c3 => Second 5’ cut, :c4 => Second 3’ cut
# File lib/bio/db/rebase.rb, line 314
#
# Parse the contents of an EMBOSS formatted enzymes file.
#
# The result is a hash indexed by the name of each entry; every value
# is itself a hash with the keys :name, :pattern, :len, :ncuts, :blunt,
# :c1, :c2, :c3 and :c4, filled in that order from the
# whitespace-separated columns of a line. All values are kept as
# strings; columns missing from a line are +nil+.
#
# Arguments
# * +lines+: file contents as a String, an Array of lines, or +nil+
# Returns:: Hash (empty when +lines+ is +nil+)
def parse_enzymes(lines)
  data = {}
  return data if lines.nil?

  columns = [:name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4]

  # Accept both a String of file contents and an Array of lines
  # (Array has no each_line).
  rows = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  rows.each do |raw|
    next if raw.start_with?('#') # comment

    fields = raw.chomp.split(' ')
    # A blank line would otherwise create an entry under a nil key.
    next if fields.empty?

    data[fields[0]] = Hash[columns.zip(fields)]
  end

  data
end
data is a hash indexed by the :name of each entry which is also a hash
-
data has the following keys: :organism, :isoschizomers, :references, :source, :methylation, :suppliers, :name, :number_of_references
# File lib/bio/db/rebase.rb, line 341
#
# Parse the contents of an EMBOSS formatted references file.
#
# Lines starting with '#' (comments) and '//' (end-of-entry markers) are
# ignored. Of the remaining lines, an entry consists of seven fixed
# lines -- name, organism, isoschizomers, methylation, source,
# suppliers, number of references -- followed by that many reference
# lines.
#
# The result is a hash indexed by the name of each entry; every value
# is a hash with the keys :name, :organism, :isoschizomers,
# :methylation, :source, :suppliers, :number_of_references (kept as the
# string read from the file) and :references (Array of strings, empty
# when the entry declares no references).
#
# Arguments
# * +lines+: file contents as a String, an Array of lines, or +nil+
# Returns:: Hash (empty when +lines+ is +nil+)
def parse_references(lines)
  data = {}
  return data if lines.nil?

  # Accept both a String of file contents and an Array of lines
  # (Array has no each_line).
  rows = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  fields = []      # the seven fixed lines of the entry being read
  references = []  # reference lines of the entry being read
  references_left = 0

  rows.each do |raw|
    next if raw.start_with?('#')  # Comment
    next if raw.start_with?('//') # End of entry marker
    line = raw.chomp

    if fields.size < 7
      fields << line
      next if fields.size < 7
      # The seventh line announces how many reference lines follow.
      references_left = line.to_i
    else
      references << line
      references_left -= 1
    end

    # Still waiting for reference lines. An entry announcing zero
    # references is complete right after its seventh line; storing it
    # here keeps the next entry's name from being read as a reference.
    next if references_left > 0

    data[fields[0]] = {
      :name => fields[0],
      :organism => fields[1],
      :isoschizomers => fields[2],
      :methylation => fields[3],
      :source => fields[4],
      :suppliers => fields[5],
      :number_of_references => fields[6],
      :references => references
    }
    fields = []
    references = []
  end

  data
end
data is a hash indexed by the supplier code
data[supplier_code] returns the suppliers name
# File lib/bio/db/rebase.rb, line 387
#
# Parse the contents of an EMBOSS formatted suppliers file.
#
# The result is a hash indexed by the supplier code;
# data[supplier_code] is the supplier's name. A line is split at its
# first whitespace character that has at least one character on each
# side; lines starting with '#' and lines that do not fit this shape
# are ignored.
#
# Arguments
# * +lines+: file contents as a String, an Array of lines, or +nil+
# Returns:: Hash (empty when +lines+ is +nil+)
def parse_suppliers(lines)
  data = {}
  return data if lines.nil?

  # Accept both a String of file contents and an Array of lines
  # (Array has no each_line).
  rows = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  rows.each do |raw|
    next if raw.start_with?('#') # comment

    match = %r{(.+?)\s(.+)}.match(raw)
    data[match[1]] = match[2] if match
  end

  data
end
Takes a string in one of the three formats listed below and returns a Bio::Reference
object
-
Possible input styles: a = ‘Inagaki, K., Hikita, T., Yanagidani, S., Nomura, Y., Kishimoto, N., Tano, T., Tanaka, H., (1993) Biosci. Biotechnol. Biochem., vol. 57, pp. 1716-1721.’ b = ‘Nekrasiene, D., Lapcinskaja, S., Kiuduliene, L., Vitkute, J., Janulaitis, A., Unpublished observations.’ c = “Grigaite, R., Maneliene, Z., Janulaitis, A., (2002) Nucleic Acids Res., vol. 30.”
# File lib/bio/db/rebase.rb, line 403
#
# Takes a reference string in one of the three styles below and returns
# a Bio::Reference built from a hash with the string keys 'title',
# 'pages', 'volume', 'year', 'journal' and 'authors'.
#
#   a = 'Inagaki, K., Hikita, T., Yanagidani, S., Nomura, Y., Kishimoto, N., Tano, T., Tanaka, H., (1993) Biosci. Biotechnol. Biochem., vol. 57, pp. 1716-1721.'
#   b = 'Nekrasiene, D., Lapcinskaja, S., Kiuduliene, L., Vitkute, J., Janulaitis, A., Unpublished observations.'
#   c = "Grigaite, R., Maneliene, Z., Janulaitis, A., (2002) Nucleic Acids Res., vol. 30."
#
# The line is split on ', '. Trailing pieces are taken off the end
# (pages, volume, then "(year) journal"), and what remains is read as
# alternating last name / initials pairs.
def raw_to_reference(line)
  parts = line.split(', ')

  if parts.last == 'Unpublished observations.'
    # Style b: the closing phrase, minus its final period, is the title.
    title = parts.pop.chop
    pages = volume = year = journal = ''
  else
    title = ''

    # Last piece without its final period: pages (style a) or volume (style c).
    tail = parts.pop.chop
    if tail =~ %r{pp\.\s}
      pages = tail.gsub('pp. ', '')
      volume = parts.pop
    else
      pages = ''
      volume = tail
    end
    volume.gsub!('vol. ', '')

    citation = %r{\((\d+)\)\s(.+)}.match(parts.pop)
    year = citation && citation[1]
    journal = citation && citation[2]
  end

  # Whatever is left alternates last name, initials; a trailing unpaired
  # piece is dropped.
  complete_pairs = parts.each_slice(2).select { |pair| pair.size == 2 }
  authors = complete_pairs.map { |surname, initials| "#{surname}, #{initials}" }

  Bio::Reference.new(
    'title' => title,
    'pages' => pages,
    'volume' => volume,
    'year' => year,
    'journal' => journal,
    'authors' => authors
  )
end
# File lib/bio/db/rebase.rb, line 295
#
# Copy the reference data onto the matching entries in @data.
#
# For every name in @reference_data that also exists in @data, the
# entry's :organism, :isoschizomers, :methylation and :source keys are
# overwritten, +suppliers+ becomes the individual characters of the
# :suppliers string, and +references+ becomes the :references strings
# converted with raw_to_reference. Names that have no entry in @data
# are skipped.
#
# Does nothing (and returns +nil+) when there is no reference data.
def setup_enzyme_and_reference_association
  return unless @reference_data

  @reference_data.each do |name, hash|
    entry = @data[name]
    # The reference data may name an enzyme the enzyme data does not
    # contain; there is no entry to annotate in that case.
    next unless entry

    [:organism, :isoschizomers,
     :methylation, :source].each { |k| entry[k] = hash[k] }
    entry.suppliers = hash[:suppliers].split('')
    entry.references = hash[:references].map { |raw| raw_to_reference(raw) }
  end
end
# File lib/bio/db/rebase.rb, line 269
#
# Build @data (one EnzymeEntry per name in @enzyme_data), then the
# sorted @enzyme_names list and its downcased copy, and finally merge
# in the reference data.
def setup_enzyme_data
  @data = {}

  @enzyme_data.each do |name, attrs|
    entry = EnzymeEntry.new
    @data[name] = entry

    entry.pattern = attrs[:pattern]
    # entry.blunt?= is a syntax error, so go through []=
    entry[:blunt?] = attrs[:blunt].to_i == 1
    entry.primary_strand_cut1 = attrs[:c1].to_i
    entry.complementary_strand_cut1 = attrs[:c2].to_i
    entry.primary_strand_cut2 = attrs[:c3].to_i
    entry.complementary_strand_cut2 = attrs[:c4].to_i

    # Defaults, in case no reference data is supplied
    [:organism, :isoschizomers, :methylation, :source].each do |key|
      entry[key] = ''
    end
    entry.suppliers = []
    entry.references = []
  end

  @enzyme_names = @data.keys.sort
  @enzyme_names_downcased = @enzyme_names.map(&:downcase)
  setup_enzyme_and_reference_association
end