class Bio::REBASE

bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files

Author

Trevor Wennblom <trevor@corevx.com>

Copyright

Copyright © 2005-2007 Midwinter Laboratories, LLC (midwinterlabs.com)

License

The Ruby License

Description

Bio::REBASE provides utilities for interacting with REBASE data in EMBOSS format. REBASE is the Restriction Enzyme Database; more information can be found here:

EMBOSS formatted files located at:

These files are the same as the “emboss_?.???” files located at:

To easily get started with the data you can simply type this command at your shell prompt:

% wget "ftp://ftp.neb.com/pub/rebase/emboss_*"

Usage

require 'bio'
require 'pp'

enz = File.read('emboss_e')
ref = File.read('emboss_r')
sup = File.read('emboss_s')

# When creating a new instance of Bio::REBASE
# the contents of the enzyme file must be passed.
# The references and suppliers file contents
# may also be passed.
rebase = Bio::REBASE.new( enz )
rebase = Bio::REBASE.new( enz, ref )
rebase = Bio::REBASE.new( enz, ref, sup )

# The 'read' class method allows you to read in files
# that are REBASE EMBOSS formatted
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )

# The data loaded may be saved in YAML format
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

# YAML formatted files can also be read with the
# class method 'load_yaml'
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

pp rebase.enzymes[0..4]                     # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
pp rebase.enzyme_name?('aasi')              # true
pp rebase['AarI'].pattern                   # "CACCTGC"
pp rebase['AarI'].blunt?                    # false
pp rebase['AarI'].organism                  # "Arthrobacter aurescens SS2-322"
pp rebase['AarI'].source                    # "A. Janulaitis"
pp rebase['AarI'].primary_strand_cut1       # 11
pp rebase['AarI'].primary_strand_cut2       # 0
pp rebase['AarI'].complementary_strand_cut1 # 15
pp rebase['AarI'].complementary_strand_cut2 # 0
pp rebase['AarI'].suppliers                 # ["F"]
pp rebase['AarI'].supplier_names            # ["Fermentas International Inc."]

pp rebase['AarI'].isoschizomers             # Currently none stored in the references file
pp rebase['AarI'].methylation               # ""

pp rebase['EcoRII'].methylation             # "2(5)"
pp rebase['EcoRII'].suppliers               # ["F", "J", "M", "O", "S"]
pp rebase['EcoRII'].supplier_names  # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
                                    # "Roche Applied Science", "Toyobo Biochemicals",
                                    # "Sigma Chemical Corporation"]

# Number of enzymes in the database
pp rebase.size                              # 673
pp rebase.enzymes.size                      # 673

rebase.each do |name, info|
  pp "#{name}:  #{info.methylation}" unless info.methylation.empty?
end

Public Class Methods

load_yaml( f_enzyme, f_reference=nil, f_supplier=nil ) click to toggle source

Read YAML formatted files

rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

Arguments

  • f_enzyme: (required) Filename to read YAML-formatted enzyme data

  • f_reference: (optional) Filename to read YAML-formatted reference data

  • f_supplier: (optional) Filename to read YAML-formatted supplier data

Returns

Bio::REBASE object

    # File lib/bio/db/rebase.rb
# Build an instance from data previously written out as YAML.
#
# * +f_enzyme+: (required) filename of the YAML-formatted enzyme data
# * +f_reference+: (optional) filename of the YAML-formatted reference data
# * +f_supplier+: (optional) filename of the YAML-formatted supplier data
#
# Returns the object produced by +new+.
def self.load_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  enzyme_data = YAML.load_file(f_enzyme)

  # The two optional files are only loaded when a filename was supplied;
  # otherwise nil is handed on in their place.
  reference_data, supplier_data = [f_reference, f_supplier].map do |path|
    YAML.load_file(path) if path
  end

  # The final +true+ is the constructor's +yaml+ flag.
  new(enzyme_data, reference_data, supplier_data, true)
end
new( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false ) click to toggle source

Constructor


Arguments

  • enzyme_lines: (required) contents of EMBOSS formatted enzymes file

  • reference_lines: (optional) contents of EMBOSS formatted references file

  • supplier_lines: (optional) contents of EMBOSS formatted suppliers files

  • yaml: (optional, default false) enzyme_lines, reference_lines, and supplier_lines are read as YAML if set to true

Returns

Bio::REBASE

    # File lib/bio/db/rebase.rb
# Constructor.
#
# * +enzyme_lines+: (required) contents of the enzymes file
# * +reference_lines+: (optional) contents of the references file
# * +supplier_lines+: (optional) contents of the suppliers file
# * +yaml+: (optional, default false) when true the three arguments are
#   stored as given instead of being run through the parse_* methods
def initialize( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false )
  @enzyme_data, @reference_data, @supplier_data =
    if yaml
      # Already-structured data: keep it untouched.
      [enzyme_lines, reference_lines, supplier_lines]
    else
      [
        parse_enzymes(enzyme_lines),
        parse_references(reference_lines),
        parse_suppliers(supplier_lines)
      ]
    end

  # The supplier table is handed to EnzymeEntry at class level before the
  # per-enzyme entries are built.
  EnzymeEntry.supplier_data = @supplier_data
  setup_enzyme_data
end
read( f_enzyme, f_reference=nil, f_supplier=nil ) click to toggle source

Read REBASE EMBOSS-formatted files

rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )

Arguments

  • f_enzyme: (required) Filename to read enzyme data

  • f_reference: (optional) Filename to read reference data

  • f_supplier: (optional) Filename to read supplier data

Returns

Bio::REBASE object

    # File lib/bio/db/rebase.rb
# Read REBASE EMBOSS-formatted files and build an instance from them.
#
# * +f_enzyme+: (required) filename to read enzyme data from
# * +f_reference+: (optional) filename to read reference data from
# * +f_supplier+: (optional) filename to read supplier data from
#
# Returns the object produced by +new+.
#
# Each file is read as one String (File.read) rather than as an Array of
# lines (IO.readlines): the constructor passes these values to the parse_*
# methods, which iterate with +each_line+, a method Array does not provide.
# This also matches the documented usage of passing File.read output to +new+.
def self.read( f_enzyme, f_reference=nil, f_supplier=nil )
  enzyme_text    = File.read(f_enzyme)
  reference_text = f_reference ? File.read(f_reference) : nil
  supplier_text  = f_supplier ? File.read(f_supplier) : nil
  self.new(enzyme_text, reference_text, supplier_text)
end

Public Instance Methods

each() { |item| ... } click to toggle source

Calls block once for each element in @data hash, passing that element as a parameter.


Arguments

  • Accepts a block

Returns

results of block operations

    # File lib/bio/db/rebase.rb
# Iterate over @data, yielding each element to the caller's block.
#
# Returns whatever @data.each returns.
def each
  @data.each do |entry|
    yield entry
  end
end
enzyme_name?(name) click to toggle source

Check if supplied name is the name of an available enzyme


Arguments

  • name: Enzyme name

Returns

true/false

    # File lib/bio/db/rebase.rb
# Case-insensitive check of +name+ against the known enzyme names.
#
# * +name+: enzyme name
#
# Returns true/false.
def enzyme_name?(name)
  lowered = name.downcase
  @enzyme_names_downcased.include?(lowered)
end
enzymes() click to toggle source

List the enzymes available


Arguments

  • none

Returns

Array of sorted enzyme names

    # File lib/bio/db/rebase.rb
# List the enzymes available.
#
# Returns the @enzyme_names array itself (not a copy).
def enzymes
  names = @enzyme_names
  names
end
save_yaml( f_enzyme, f_reference=nil, f_supplier=nil ) click to toggle source

Save the current data

rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

Arguments

  • f_enzyme: (required) Filename to save YAML formatted output of enzyme data

  • f_reference: (optional) Filename to save YAML formatted output of reference data

  • f_supplier: (optional) Filename to save YAML formatted output of supplier data

Returns

nothing

    # File lib/bio/db/rebase.rb
# Save the current data as YAML.
#
# * +f_enzyme+: (required) filename for the enzyme data
# * +f_reference+: (optional) filename for the reference data
# * +f_supplier+: (optional) filename for the supplier data
#
# Returns nil.
def save_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  # One writer shared by the three outputs: open for writing, emit the dump.
  dump_to = lambda do |path, payload|
    File.open(path, 'w') { |io| io.puts YAML.dump(payload) }
  end

  dump_to.call(f_enzyme, @enzyme_data)
  dump_to.call(f_reference, @reference_data) if f_reference
  dump_to.call(f_supplier, @supplier_data) if f_supplier
  nil
end

Protected Instance Methods

parse_enzymes( lines ) click to toggle source

data is a hash indexed by the :name of each entry which is also a hash

  • data has the following keys: :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4, where :c1 => first 5’ cut, :c2 => first 3’ cut, :c3 => second 5’ cut, :c4 => second 3’ cut

    # File lib/bio/db/rebase.rb
# Parse the contents of an enzymes file.
#
# * +lines+: nil, a String holding the whole file, or any object whose
#   +each+ yields one line at a time (e.g. the Array from IO.readlines)
#
# Returns a Hash keyed by enzyme name. Each value is a Hash with the keys
# :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4, filled from the
# whitespace-separated columns of the line, in that order, as Strings.
#
# Lines starting with '#' and blank lines are ignored. Skipping blank lines
# matters: they would otherwise be stored under a nil key.
def parse_enzymes( lines )
  data = {}
  return data if lines.nil?

  # Strings are walked with each_line; other containers with each.
  iterator = lines.respond_to?(:each_line) ? :each_line : :each
  lines.send(iterator) do |line|
    next if line.start_with?('#')

    # split(' ') splits on runs of whitespace and drops the line terminator,
    # so the caller's data is never modified in place.
    a = line.split(' ')
    next if a.empty?

    data[ a[0] ] = {
      :name => a[0],
      :pattern => a[1],
      :len => a[2],
      :ncuts => a[3],
      :blunt => a[4],
      :c1 => a[5],
      :c2 => a[6],
      :c3 => a[7],
      :c4 => a[8]
    }
  end
  data
end
parse_references( lines ) click to toggle source

data is a hash indexed by the :name of each entry which is also a hash

  • data has the following keys: :organism, :isoschizomers, :references, :source, :methylation, :suppliers, :name, :number_of_references

    # File lib/bio/db/rebase.rb
# Parse the contents of a references file.
#
# * +lines+: nil, a String holding the whole file, or any object whose
#   +each+ yields one line at a time (e.g. the Array from IO.readlines)
#
# Returns a Hash keyed by enzyme name. Each value is a Hash with the keys
# :name, :organism, :isoschizomers, :methylation, :source, :suppliers,
# :number_of_references (kept as a String) and :references (an Array of the
# raw reference lines).
#
# An entry is seven header lines followed by as many reference lines as the
# seventh header line announces. Lines starting with '#' or '//' are
# skipped wherever they occur. Blank lines are NOT skipped, because a header
# field may legitimately be empty.
def parse_references( lines )
  data = {}
  return data if lines.nil?

  header = []          # the seven header lines of the entry being read
  references = []      # reference lines collected so far for that entry
  references_left = 0

  # Store the finished entry and start collecting the next one.
  store_entry = lambda do
    data[ header[0] ] = {
      :name => header[0],
      :organism => header[1],
      :isoschizomers => header[2],
      :methylation => header[3],
      :source => header[4],
      :suppliers => header[5],
      :number_of_references => header[6],
      :references => references
    }
    header = []
    references = []
  end

  # Strings are walked with each_line; other containers with each.
  iterator = lines.respond_to?(:each_line) ? :each_line : :each
  lines.send(iterator) do |raw|
    next if raw.start_with?('#')   # Comment
    next if raw.start_with?('//')  # End of entry marker
    line = raw.chomp

    if header.size < 7
      header << line
      next unless header.size == 7

      references_left = line.to_i
      # With no references announced, the entry is complete right now.
      # Waiting for a reference line would consume the next entry's name.
      store_entry.call if references_left <= 0
      next
    end

    references << line
    references_left -= 1
    store_entry.call if references_left == 0
  end
  data
end
parse_suppliers( lines ) click to toggle source

data is a hash indexed by the supplier code

data[supplier_code]
returns the supplier's name
    # File lib/bio/db/rebase.rb
# Parse the contents of a suppliers file.
#
# * +lines+: nil, a String holding the whole file, or any object whose
#   +each+ yields one line at a time (e.g. the Array from IO.readlines)
#
# Returns a Hash mapping supplier code => supplier name. On each line the
# code is the text before the first whitespace character that is followed by
# more text, and the name is the rest of the line. Lines starting with '#'
# and lines that do not have both parts are ignored.
def parse_suppliers( lines )
  data = {}
  return data if lines.nil?

  # Strings are walked with each_line; other containers with each.
  iterator = lines.respond_to?(:each_line) ? :each_line : :each
  lines.send(iterator) do |line|
    next if line.start_with?('#')

    # chomp first so a trailing "\r\n" cannot end up inside the name.
    match = %r{(.+?)\s(.+)}.match(line.chomp)
    data[match[1]] = match[2] if match
  end
  data
end
raw_to_reference( line ) click to toggle source

Takes a string in one of the three formats listed below and returns a Bio::Reference object

  • Possible input styles: a = ‘Inagaki, K., Hikita, T., Yanagidani, S., Nomura, Y., Kishimoto, N., Tano, T., Tanaka, H., (1993) Biosci. Biotechnol. Biochem., vol. 57, pp. 1716-1721.’ b = ‘Nekrasiene, D., Lapcinskaja, S., Kiuduliene, L., Vitkute, J., Janulaitis, A., Unpublished observations.’ c = “Grigaite, R., Maneliene, Z., Janulaitis, A., (2002) Nucleic Acids Res., vol. 30.”

    # File lib/bio/db/rebase.rb
# Convert one raw reference line into a Bio::Reference.
#
# The line is cut at every ', '. Working from the end:
# * a final 'Unpublished observations.' becomes the title (without the
#   trailing character) and pages/volume/year/journal are left empty;
# * otherwise the last piece is either 'pp. <pages>' (then preceded by a
#   volume piece) or the volume itself, and the piece before that is
#   '(<year>) <journal>'.
# Whatever remains is read as alternating last-name / initials pieces.
def raw_to_reference( line )
  pieces = line.split(', ')

  if pieces.last == 'Unpublished observations.'
    title = pieces.pop.chop
    pages = volume = year = journal = ''
  else
    title = ''

    tail = pieces.pop.chop
    if tail =~ %r{pp\.\s}
      pages  = tail.gsub('pp. ', '')
      volume = pieces.pop
    else
      pages  = ''
      volume = tail
    end
    volume = volume.gsub('vol. ', '')

    # year and journal stay nil when the piece is not '(<digits>) <text>'
    citation = %r{\((\d+)\)\s(.+)}.match(pieces.pop)
    year    = citation && citation[1]
    journal = citation && citation[2]
  end

  # Pair the pieces up; a trailing unpaired piece is dropped.
  authors = pieces.each_slice(2).select { |pair| pair.size == 2 }.map do |last_name, initials|
    "#{last_name}, #{initials}"
  end

  Bio::Reference.new(
    'title' => title,
    'pages' => pages,
    'volume' => volume,
    'year' => year,
    'journal' => journal,
    'authors' => authors
  )
end
setup_enzyme_and_reference_association() click to toggle source
    # File lib/bio/db/rebase.rb
# Copy the parsed reference data onto the matching entries in @data.
#
# For every name in @reference_data that also exists in @data this sets
# :organism, :isoschizomers, :methylation and :source, stores the supplier
# codes as an Array of single characters, and stores the references
# converted through raw_to_reference.
#
# Does nothing when @reference_data is nil or false.
def setup_enzyme_and_reference_association
  return unless @reference_data
  @reference_data.each do |name, hash|
    d = @data[name]
    # Reference data can name an enzyme that has no entry in @data; there is
    # nothing to attach it to, so skip it instead of failing on nil.
    next unless d

    [:organism, :isoschizomers,
    :methylation, :source].each { |k| d[k] = hash[k] }
    # to_s / Array() tolerate records whose :suppliers or :references is nil.
    d.suppliers = hash[:suppliers].to_s.split('')
    d.references = Array(hash[:references]).map { |raw| raw_to_reference(raw) }
  end
end
setup_enzyme_data() click to toggle source
    # File lib/bio/db/rebase.rb
# Build @data (one EnzymeEntry per enzyme) from @enzyme_data, derive the
# sorted name lists, then merge in the reference data.
#
# Returns whatever setup_enzyme_and_reference_association returns.
def setup_enzyme_data
  @data = {}

  @enzyme_data.each do |name, hash|
    entry = EnzymeEntry.new
    @data[name] = entry

    entry.pattern = hash[:pattern]
    # entry.blunt?= is not valid syntax, so the key is set through []=
    entry[:blunt?] = hash[:blunt].to_i == 1

    # Columns c1..c4 alternate primary / complementary strand.
    entry.primary_strand_cut1       = hash[:c1].to_i
    entry.complementary_strand_cut1 = hash[:c2].to_i
    entry.primary_strand_cut2       = hash[:c3].to_i
    entry.complementary_strand_cut2 = hash[:c4].to_i

    # Defaults, in case no reference data is supplied
    [:organism, :isoschizomers, :methylation, :source].each do |key|
      entry[key] = ''
    end
    entry.suppliers = []
    entry.references = []
  end

  @enzyme_names = @data.keys.sort
  @enzyme_names_downcased = @enzyme_names.map(&:downcase)
  setup_enzyme_and_reference_association
end