class Bio::SOFT

bio/db/soft.rb - Interface for SOFT formatted files

Author

Trevor Wennblom <trevor@corevx.com>

Copyright

Copyright © 2007 Midwinter Laboratories, LLC (midwinterlabs.com)

License

The Ruby License

Description

“SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata.” – GEO, National Center for Biotechnology Information

The Bio::SOFT class reads SOFT Series- or Platform-formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be accessed with Ruby methods.

Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.

The format specification is located here: https://www.ncbi.nlm.nih.gov/geo/info/soft.html

SOFT data files may be directly downloaded here: https://ftp.ncbi.nlm.nih.gov/geo/

NCBI’s Gene Expression Omnibus (GEO) is here: https://www.ncbi.nlm.nih.gov/geo/

Usage

If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.

The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.

Attributes are created dynamically based on the data in the file. Keys have not been predefined because of the variability of SOFT files in the wild.

Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case-sensitive. Table headers are case-insensitive.

require 'bio'

lines = IO.readlines('GSE3457_family.soft') 
soft = Bio::SOFT.new(lines)

soft.platform[:geo_accession]             # => "GPL2092"
soft.platform[:organism]                  # => "Populus"
soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.platform[:data_row_count]            # => "240"
soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
soft.platform[:"contact_zip/postal_code"] # => "97331"
soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
soft.platform[:table].rows.size           # => 240
soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

soft.series[:geo_accession]               # => "GSE3457"
soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.series[:platform_id]                 # => "GPL2092"
soft.series[:sample_id].size              # => 74
soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.samples.size                         # => 74
soft.samples[:GSM77600][:series_id]       # => "GSE3457"
soft.samples['GSM77600'][:series_id]      # => "GSE3457"
soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
soft.samples[:GSM77600][:type]            # => "RNA"
soft.samples[:GSM77600][:title]           # => "jst2b2"
soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
soft.samples[:GSM77600][:table].rows.size # => 217
soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

lines = IO.readlines('GDS100.soft') 
soft = Bio::SOFT.new(lines)

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.subsets.size                         # => 8
soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"

soft.dataset[:order]                      # => "none"
soft.dataset[:sample_organism]            # => "Escherichia coli"
soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
soft.dataset[:table].rows.size            # => 5764
soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
soft.dataset[:table].rows[5][4]           # => "0.242"
soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
soft.dataset[:table].rows[5]['GSM549']    # => "0.097"