class Bio::SOFT

bio/db/soft.rb - Interface for SOFT formatted files

Author

Trevor Wennblom <trevor@corevx.com>

Copyright

Copyright © 2007 Midwinter Laboratories, LLC (midwinterlabs.com)

License

The Ruby License

Description

“SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata.” – GEO, National Center for Biotechnology Information

The Bio::SOFT module reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.

Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.

Format specification is located here:

SOFT data files may be directly downloaded here:

NCBI's Gene Expression Omnibus (GEO) is here:

Usage

If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.

The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.

Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in-the-wild.

Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.

require 'bio'

lines = IO.readlines('GSE3457_family.soft') 
soft = Bio::SOFT.new(lines)

soft.platform[:geo_accession]             # => "GPL2092"
soft.platform[:organism]                  # => "Populus"
soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.platform[:data_row_count]            # => "240"
soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
soft.platform[:"contact_zip/postal_code"] # => "97331"
soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
soft.platform[:table].rows.size           # => 240
soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

soft.series[:geo_accession]               # => "GSE3457"
soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.series[:platform_id]                 # => "GPL2092"
soft.series[:sample_id].size              # => 74
soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.samples.size                         # => 74
soft.samples[:GSM77600][:series_id]       # => "GSE3457"
soft.samples['GSM77600'][:series_id]      # => "GSE3457"
soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
soft.samples[:GSM77600][:type]            # => "RNA"
soft.samples[:GSM77600][:title]           # => "jst2b2"
soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
soft.samples[:GSM77600][:table].rows.size # => 217
soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

lines = IO.readlines('GDS100.soft') 
soft = Bio::SOFT.new(lines)

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.subsets.size                         # => 8
soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"

soft.dataset[:order]                      # => "none"
soft.dataset[:sample_organism]            # => "Escherichia coli"
soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
soft.dataset[:table].rows.size            # => 5764
soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
soft.dataset[:table].rows[5][4]           # => "0.242"
soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
soft.dataset[:table].rows[5]['GSM549']    # => "0.097"

Constants

LINE_TYPE_ENTITY_ATTRIBUTE
LINE_TYPE_ENTITY_INDICATOR
LINE_TYPE_TABLE_HEADER
TABLE_COLUMN_DELIMITER

data table row defined by absence of line type character

Attributes

database[RW]
dataset[RW]
platform[RW]
samples[RW]
series[RW]
subsets[RW]

Public Class Methods

new(lines=nil) click to toggle source

Constructor


Arguments

  • lines: (required) contents of SOFT formatted file

Returns

Bio::SOFT

# File lib/bio/db/soft.rb, line 147
# Creates the empty entity containers and then parses +lines+ into them.
#
# Arguments
# * +lines+: contents of a SOFT formatted file, handed to #process
# Returns:: Bio::SOFT
def initialize(lines=nil)
  # Every SOFT file describes one database.
  @database = Database.new

  # Entities of Series / Platform files.
  @series, @platform, @samples = Series.new, Platform.new, Samples.new

  # Entities of DataSet files.
  @dataset, @subsets = Dataset.new, Subsets.new

  process(lines)
end

Protected Instance Methods

custom_raise( line_number_with_0_based_indexing, msg ) click to toggle source
# File lib/bio/db/soft.rb, line 381
# Raises a RuntimeError whose message is the 1-based input line number
# followed by +msg+, separated by a tab.
#
# Arguments
# * +line_number_with_0_based_indexing+: index of the offending line
# * +msg+: description of the problem
def custom_raise( line_number_with_0_based_indexing, msg )
  # Report the line number the way a text editor would show it.
  human_line_number = line_number_with_0_based_indexing + 1
  parts = ["Error processing input line: #{human_line_number}", msg]
  raise parts.join("\t")
end
error_msg( i, extra_info=nil ) click to toggle source
# File lib/bio/db/soft.rb, line 354
# Returns the text of a numbered parsing error as a single String.
#
# Arguments
# * +i+: error code
#   * 10 - table row found outside of a table
#   * 20 - table header found inside an entity that cannot hold a table
#   * 30 - entity attribute found before any entity indicator
#   * 40 - unknown entity indicator
# * +extra_info+: (optional) context inserted into messages 20 and 40
# Returns:: String
# Raises:: IndexError when +i+ is not one of the codes above
def error_msg( i, extra_info=nil )
  case i
  when 10
    x = ["Lines without line-type characters are rows in a table, but",
    "a line containing an entity indicator such as",
    "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
    "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
    "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
    "previously encountered or it does not appear that this line is",
    "in a table."]
  when 20
    # Tables are accepted inside samples, platforms and datasets; the
    # message has to list the same entities the parser checks for.
    x = ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
      "Current table information found inside #{extra_info}."]
  when 30
    x = ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
      "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
  when 40
    x = ["Unknown entity indicator.  Must be DATABASE, SAMPLE, PLATFORM,",
      "SERIES, DATASET, or SUBSET."]
    # The parser passes the offending line; show it instead of dropping it.
    x << "Found: #{extra_info.inspect}." unless extra_info.nil?
  else
    raise IndexError, "Unknown error message requested."
  end

  x.join(" ")
end
process(lines) click to toggle source
# File lib/bio/db/soft.rb, line 272
# Parses SOFT formatted +lines+ and stores what it finds in the entity
# containers (@database, @dataset, @platform, @series, @samples, @subsets).
#
# The first character of each non-blank line selects how it is handled:
# * LINE_TYPE_ENTITY_INDICATOR - selects the entity that following lines
#   belong to
# * LINE_TYPE_ENTITY_ATTRIBUTE - adds an attribute to the current entity;
#   a repeated attribute is collected into an Array
# * LINE_TYPE_TABLE_HEADER - adds a column description to the current
#   entity's table
# * anything else - a header or data row of the current entity's table
#
# Arguments
# * +lines+: enumerable of Strings (the strings are not modified); +nil+
#   is accepted and leaves every container empty
# Raises:: RuntimeError (through #custom_raise) for malformed input
def process(lines)
  # The constructor defaults +lines+ to nil; there is nothing to parse then.
  return if lines.nil?

  current_indicator = nil
  current_class_accessor = nil
  in_table = false

  lines.each_with_index do |raw_line, line_number|
    next if raw_line.nil?
    # Strip a copy: the caller's strings stay untouched and frozen strings
    # are accepted.
    line = raw_line.strip
    next if line.empty?

    case line[0].chr
    when LINE_TYPE_ENTITY_INDICATOR
      current_indicator, value = split_label_value_in( line[1..-1] )
      # A table never spans two entities; do not let an unterminated table
      # leak into the entity that starts here.
      in_table = false

      case current_indicator
      when 'DATABASE'
        current_class_accessor = @database
      when 'DATASET'
        current_class_accessor = @dataset
      when 'PLATFORM'
        current_class_accessor = @platform
      when 'SERIES'
        current_class_accessor = @series
      when 'SAMPLE'
        @samples[value] = Sample.new
        current_class_accessor = @samples[value]
      when 'SUBSET'
        @subsets[value] = Subset.new
        current_class_accessor = @subsets[value]
      else
        custom_raise( line_number, error_msg(40, line) )
      end

    when LINE_TYPE_ENTITY_ATTRIBUTE
      if current_indicator.nil?
        custom_raise( line_number, error_msg(30) )
      end

      # Handle lines such as '!platform_table_begin' and '!platform_table_end'
      if in_table
        if line =~ %r{table_begin}
          next
        elsif line =~ %r{table_end}
          in_table = false
          next
        end
      end

      key, value = split_label_value_in( line, true )
      key_s = key.to_sym

      if current_class_accessor.include?( key_s )
        # Second occurrence of an attribute: promote the stored String to
        # an Array before appending.
        if current_class_accessor[ key_s ].class != Array
          current_class_accessor[ key_s ] = [ current_class_accessor[ key_s ] ]
        end
        current_class_accessor[ key_s ] << value
      else
        current_class_accessor[ key_s ] = value
      end

    when LINE_TYPE_TABLE_HEADER
      unless ['SAMPLE', 'PLATFORM', 'DATASET'].include?( current_indicator )
        custom_raise( line_number, error_msg(20, current_indicator.inspect) )
      end

      in_table = true   # may be redundant, computationally not worth checking

      # We only expect one table per platform, sample or dataset
      current_class_accessor[:table] ||= Table.new
      key, value = split_label_value_in( line )
      # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
      current_class_accessor[:table].header_description[ key[1..-1] ] = value

    else
      # Type: No line type - should be a row in a table.
      if current_indicator.nil? || !in_table
        custom_raise( line_number, error_msg(10) )
      end
      current_class_accessor[:table].add_header_or_row( line )
    end
  end
end
split_label_value_in( line, shift_key=false ) click to toggle source
# File lib/bio/db/soft.rb, line 386
# Splits a "label = value" line at the first "=", dropping the whitespace
# around it.
#
# Arguments
# * +line+: the text to split
# * +shift_key+: when true, everything up to and including the first
#   underscore is removed from the label ("!Sample_title" becomes "title")
# Returns:: two-element Array [label, value]
# Raises:: RuntimeError when the line has no "=" or, with +shift_key+,
#          when the label has no underscore
def split_label_value_in( line, shift_key=false )
  separator = %r{\s*=\s*}.match( line )
  key, value = separator.pre_match, separator.post_match if separator

  if shift_key && key
    underscore = %r{_}.match( key )
    # No underscore means there is no label left after the entity prefix.
    key = underscore ? underscore.post_match : nil
  end

  if key.nil? || value.nil?
    # Put the offending text in the exception rather than on stdout.
    raise "Unable to split line into label and value: #{line.inspect}"
  end

  [key, value]
end