class Bio::SOFT
bio/db/soft.rb - Interface for SOFT formatted files
- Author: Trevor Wennblom <trevor@corevx.com>
- Copyright: Copyright © 2007 Midwinter Laboratories, LLC (midwinterlabs.com)
- License: The Ruby License
Description
“SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata.” – GEO, National Center for Biotechnology Information
The Bio::SOFT class reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.
Bio::SOFT also supports the reading of SOFT DataSet files, which contain one database, one dataset, and many subsets.
Format specification is located here:
SOFT data files may be directly downloaded here:
NCBI’s Gene Expression Omnibus (GEO) is here:
Usage
If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.
The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.
Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in the wild.
Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.
require 'bio'

lines = IO.readlines('GSE3457_family.soft')
soft = Bio::SOFT.new(lines)

soft.platform[:geo_accession] # => "GPL2092"
soft.platform[:organism] # => "Populus"
soft.platform[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.platform[:data_row_count] # => "240"
soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
soft.platform[:"contact_zip/postal_code"] # => "97331"
soft.platform[:table].header # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
soft.platform[:table].header_description # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
soft.platform[:table].rows.size # => 240
soft.platform[:table].rows[5] # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
soft.platform[:table].rows[5][4] # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5][:organism] # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

soft.series[:geo_accession] # => "GSE3457"
soft.series[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.series[:platform_id] # => "GPL2092"
soft.series[:sample_id].size # => 74
soft.series[:sample_id][0..4] # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

soft.database[:name] # => "Gene Expression Omnibus (GEO)"
soft.database[:ref] # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute] # => "NCBI NLM NIH"

soft.samples.size # => 74
soft.samples[:GSM77600][:series_id] # => "GSE3457"
soft.samples['GSM77600'][:series_id] # => "GSE3457"
soft.samples[:GSM77600][:platform_id] # => "GPL2092"
soft.samples[:GSM77600][:type] # => "RNA"
soft.samples[:GSM77600][:title] # => "jst2b2"
soft.samples[:GSM77600][:table].header # => ["ID_REF", "VALUE"]
soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
soft.samples[:GSM77600][:table].rows.size # => 217
soft.samples[:GSM77600][:table].rows[5] # => ["A039P68U", "8.19"]
soft.samples[:GSM77600][:table].rows[5][0] # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5][:id_ref] # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

lines = IO.readlines('GDS100.soft')
soft = Bio::SOFT.new(lines)

soft.database[:name] # => "Gene Expression Omnibus (GEO)"
soft.database[:ref] # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute] # => "NCBI NLM NIH"

soft.subsets.size # => 8
soft.subsets.keys # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
soft.subsets[:GDS100_7] # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
soft.subsets['GDS100_7'][:sample_id] # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:sample_id] # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:dataset_id] # => "GDS100"

soft.dataset[:order] # => "none"
soft.dataset[:sample_organism] # => "Escherichia coli"
soft.dataset[:table].header # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
soft.dataset[:table].rows.size # => 5764
soft.dataset[:table].rows[5] # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
soft.dataset[:table].rows[5][4] # => "0.242"
soft.dataset[:table].rows[5][:gsm549] # => "0.097"
soft.dataset[:table].rows[5][:GSM549] # => "0.097"
soft.dataset[:table].rows[5]['GSM549'] # => "0.097"
Constants
- LINE_TYPE_ENTITY_ATTRIBUTE
- LINE_TYPE_ENTITY_INDICATOR
- LINE_TYPE_TABLE_HEADER
- TABLE_COLUMN_DELIMITER
data table row defined by absence of line type character
Attributes
Public Class Methods
Constructor
Arguments
- lines: (required) contents of SOFT formatted file
- Returns
# File lib/bio/db/soft.rb
#
# Constructor
#
# Creates the empty containers for every SOFT entity (database, series,
# platform, samples, dataset, subsets) and then parses +lines+ into them.
# ---
# *Arguments*
# * _lines_: contents of a SOFT formatted file; must respond to
#   +each_with_index+ and yield one String per line (for example the result
#   of IO.readlines). When omitted or +nil+, nothing is parsed and the
#   containers are left as freshly created.
def initialize(lines = nil)
  @database = Database.new

  @series   = Series.new
  @platform = Platform.new
  @samples  = Samples.new

  @dataset = Dataset.new
  @subsets = Subsets.new

  # The parameter defaults to nil, but #process iterates over it
  # unconditionally; guard here so that calling the constructor without
  # arguments no longer fails with NoMethodError.
  process(lines) unless lines.nil?
end
Protected Instance Methods
# File lib/bio/db/soft.rb
#
# Raises a RuntimeError whose message is the 1-based number of the offending
# input line followed by +msg+, separated by a tab character.
# ---
# *Arguments*
# * _line_number_with_0_based_indexing_: 0-based index of the input line;
#   it is converted to a 1-based line number for the message
# * _msg_: explanation appended after the line number
# *Raises*:: RuntimeError (always)
def custom_raise(line_number_with_0_based_indexing, msg)
  raise ["Error processing input line: #{line_number_with_0_based_indexing + 1}",
         msg].join("\t")
end
# File lib/bio/db/soft.rb
#
# Returns the text of a parser error message selected by a numeric code.
# ---
# *Arguments*
# * _i_: message code
#   * 10 - a table row was found outside of a table
#   * 20 - a table header was found inside an entity that may not hold a table
#   * 30 - an entity attribute line was found before any entity indicator line
#   * 40 - the entity indicator is not one of the recognised names
# * _extra_info_: (optional) text interpolated into message 20; not used by
#   the other messages
# *Returns*:: String (the message fragments joined with single spaces)
# *Raises*:: IndexError when _i_ is not one of the codes listed above
def error_msg(i, extra_info = nil)
  x = case i
      when 10
        ["Lines without line-type characters are rows in a table, but",
         "a line containing an entity indicator such as",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
         "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
         "previously encountered or it does not appear that this line is",
         "in a table."]
      when 20
        # Tables are allowed inside samples, platforms and datasets; the
        # wording matches the check made when a table header line is parsed.
        ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
         "Current table information found inside #{extra_info}."]
      when 30
        ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
         "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
      when 40
        ["Unknown entity indicator. Must be DATABASE, SAMPLE, PLATFORM,",
         "SERIES, DATASET, or SUBSET."]
      else
        raise IndexError, "Unknown error message requested."
      end

  x.join(" ")
end
# File lib/bio/db/soft.rb
#
# Parses SOFT formatted lines and stores their content in the entity
# containers held in @database, @dataset, @platform, @series, @samples and
# @subsets.
#
# Each non-blank line is dispatched on its first character:
# * LINE_TYPE_ENTITY_INDICATOR - selects the entity that following lines
#   belong to (a new Sample / Subset is created for SAMPLE / SUBSET)
# * LINE_TYPE_ENTITY_ATTRIBUTE - stores a key/value pair on the current
#   entity; a repeated key turns the stored value into an Array
# * LINE_TYPE_TABLE_HEADER - records a column description in the current
#   entity's table, creating the table on first use
# * anything else - handed to the current entity's table as a header or row
# ---
# *Arguments*
# * _lines_: object responding to +each_with_index+ that yields one String
#   per input line. The Strings are not modified.
# *Raises*:: RuntimeError (through #custom_raise) when a line appears in a
#            context where it is not allowed
def process(lines)
  current_indicator = nil
  current_class_accessor = nil
  in_table = false

  lines.each_with_index do |raw_line, line_number|
    # Strip into a new String: the caller's lines stay untouched and frozen
    # Strings are accepted.
    line = raw_line.strip
    next if line.empty?

    case line[0].chr
    when LINE_TYPE_ENTITY_INDICATOR
      current_indicator, value = split_label_value_in(line[1..-1])

      case current_indicator
      when 'DATABASE'
        current_class_accessor = @database
      when 'DATASET'
        current_class_accessor = @dataset
      when 'PLATFORM'
        current_class_accessor = @platform
      when 'SERIES'
        current_class_accessor = @series
      when 'SAMPLE'
        @samples[value] = Sample.new
        current_class_accessor = @samples[value]
      when 'SUBSET'
        @subsets[value] = Subset.new
        current_class_accessor = @subsets[value]
      else
        custom_raise(line_number, error_msg(40, line))
      end

    when LINE_TYPE_ENTITY_ATTRIBUTE
      custom_raise(line_number, error_msg(30)) if current_indicator.nil?

      # Handle lines such as '!platform_table_begin' and '!platform_table_end'.
      # They carry no value, so they are skipped instead of being stored.
      if in_table
        next if line =~ %r{table_begin}

        if line =~ %r{table_end}
          in_table = false
          next
        end
      end

      key, value = split_label_value_in(line, true)
      key_s = key.to_sym

      if current_class_accessor.include?(key_s)
        # Second occurrence of a key: promote the single value to an Array.
        unless current_class_accessor[key_s].is_a?(Array)
          current_class_accessor[key_s] = [current_class_accessor[key_s]]
        end
        current_class_accessor[key_s] << value
      else
        current_class_accessor[key_s] = value
      end

    when LINE_TYPE_TABLE_HEADER
      unless %w[SAMPLE PLATFORM DATASET].include?(current_indicator)
        custom_raise(line_number, error_msg(20, current_indicator.inspect))
      end

      in_table = true # may be redundant, computationally not worth checking

      # We only expect one table per platform, sample or dataset
      current_class_accessor[:table] ||= Table.new
      key, value = split_label_value_in(line)
      # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
      current_class_accessor[:table].header_description[key[1..-1]] = value

    else
      # Type: No line type - should be a row in a table.
      if current_indicator.nil? || !in_table
        custom_raise(line_number, error_msg(10))
      end
      current_class_accessor[:table].add_header_or_row(line)
    end
  end
end
# File lib/bio/db/soft.rb
#
# Splits a "label = value" line at the first equals sign, discarding the
# whitespace immediately around that sign.
# ---
# *Arguments*
# * _line_: the text to split
# * _shift_key_: (optional, default +false+) when true, everything up to and
#   including the first underscore is removed from the label, e.g.
#   "!Sample_title" becomes "title"
# *Returns*:: two-element Array <tt>[key, value]</tt> of Strings
# *Raises*:: RuntimeError when the line has no equals sign, or when
#            _shift_key_ is true and the label contains no underscore
def split_label_value_in(line, shift_key = false)
  key = nil
  value = nil

  # Regexp#match tolerates nil, so a nil line falls through to the error below.
  separator = %r{\s*=\s*}.match(line)
  if separator
    key = separator.pre_match
    value = separator.post_match
  end

  if shift_key && key
    underscore = key.index('_')
    # No underscore leaves the key nil, which is reported as an error below.
    key = underscore && key[(underscore + 1)..-1]
  end

  if key.nil? || value.nil?
    # Carry the offending line in the exception instead of printing it to
    # standard output and raising without a message.
    raise "Unable to split line into a label and a value: #{line.inspect}"
  end

  [key, value]
end