class Bio::FastaDefline
Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs
(NCBI standard FASTA sequence identifiers) or “:”-separated IDs.
The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
Examples

  rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  rub.entry_id    ==> 'gi|671595'
  rub.get('emb')  ==> 'CAA85678.1'
  rub.emb         ==> 'CAA85678.1'
  rub.gi          ==> '671595'
  rub.accession   ==> 'CAA85678'
  rub.accessions  ==> [ 'CAA85678' ]
  rub.acc_version ==> 'CAA85678.1'
  rub.locus       ==> nil
  rub.list_ids    ==> [["gi", "671595"], ["emb", "CAA85678.1", nil], ["Perovskia abrotanoides"]]

  ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  ckr.entry_id     ==> "gi|2495000"
  ckr.sp           ==> "CCKR_CAVPO"
  ckr.pir          ==> "I51898"
  ckr.gb           ==> "AAB29504.1"
  ckr.gi           ==> "2495000"
  ckr.accession    ==> "AAB29504"
  ckr.accessions   ==> ["Q63931", "AAB29504"]
  ckr.acc_version  ==> "AAB29504.1"
  ckr.locus        ==> nil
  ckr.description  ==> "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  ckr.descriptions ==> ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)", "cholecystokinin A receptor - guinea pig", "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  ckr.words        ==> ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig", "receptor", "type"]
  ckr.id_strings   ==> ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898", "544724", "AAB29504.1", "Cavia"]
  ckr.list_ids     ==> [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"], ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"], ["gb", "AAB29504.1", nil], ["Cavia"]]
References
- Fasta format description (NCBI): www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
- Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.) (Dead link. Please find in web.archive.org/ ): blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
- Program Parameters for formatdb and fastacmd (by Tao Tao): www.ncbi.nlm.nih.gov/staff/tao/URLAPI/formatdb_fastacmd.html#t1.1
- Formatdb README: ftp.ncbi.nih.gov/blast/documents/formatdb.html
Constants
- KillRegexpArray
- KillWords
- KillWordsHash
- NSIDs
Attributes
entry_id: Shows a possibly unique identifier. Returns a string.
list_ids: Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings.
Public Class Methods
Parses given string.
# File lib/bio/db/fasta/defline.rb, line 180
# Builds a defline object from +str+.
# +str+ is cut at every Ctrl-A ("\x01") and each piece is handed to
# add_defline in order.
def initialize(str)
  @entry_id = nil
  @deflines = []
  @info     = {}
  @list_ids = []

  str.split("\x01").each { |piece| add_defline(piece) }
end
Public Instance Methods
Shows accession with version number. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 492
# Memoized lookup of the ID labelled 'acc_version' via get_by_type.
# defined? is used instead of ||= so that a nil result is cached as well.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end
Shows an accession number.
# File lib/bio/db/fasta/defline.rb, line 510
# Memoized accession number.
# When acc_version is available, the part before the first '.' is used;
# otherwise the first element of accessions (possibly nil) is used.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end
Shows accession numbers. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 501
# Memoized list of accession numbers.
# Collects every 'accession' and 'acc_version' value and removes
# everything from the first '.' onwards from each one.
def accessions
  return @accessions if defined?(@accessions)

  raw = get_all_by_type('accession', 'acc_version')
  @accessions = raw.map { |id| id.sub(/\..*\z/, '') }
end
Parses given string and adds parsed data.
# File lib/bio/db/fasta/defline.rb, line 194
# Parses a single defline +str+ and records the result.
#
# Three shapes are recognised, tried in this order:
# 1. '|'-separated NSIDs,
# 2. a "db:id" colon-separated ID,
# 3. a bare first word followed by an optional description.
# Anything else is stored verbatim as the ID with an empty description.
#
# The parsed record is appended to @deflines, and @entry_id is set from
# the first defline that is added.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    id_part = Regexp.last_match(1)
    desc    = Regexp.last_match(2)
    tokens  = id_part.split('|')
    # String#split drops a trailing empty field; put it back.
    tokens << '' if id_part[-1, 1] == '|'
    record  = parse_NSIDs(tokens)
    id_part = record[0].join('|')
    record.unshift('|')
    # Tokens still present after parse_NSIDs are glued back in front of
    # the description (presumably parse_NSIDs removes what it understood
    # -- confirm against its definition).
    desc = tokens.join('|') + ' ' + desc unless tokens.empty?
    record << desc
    match_EC(desc)
    parse_square_brackets(desc).each do |bracketed|
      next if match_EC(bracketed, false)
      next unless bracketed =~ /\A[A-Z]/

      @list_ids << [bracketed]
      @info['organism'] ||= bracketed
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # examples:
    # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    id_part = Regexp.last_match(1)
    desc    = Regexp.last_match(2)
    parts   = parse_ColonSepID(id_part)
    id_part = parts.join(':')
    record  = [':', parts, desc]
    match_EC(desc)
    parse_square_brackets(desc).each do |bracketed|
      if !match_EC(bracketed, false) && bracketed =~ /:/
        parse_ColonSepID(bracketed)
      elsif bracketed =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
        @list_ids << [Regexp.last_match(1)]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # examples:
    # >ABC12345 this is test
    id_part = Regexp.last_match(1)
    desc    = Regexp.last_match(2).to_s
    @list_ids << [id_part.chomp('.')]
    record = ['', [id_part], desc]
    match_EC(desc)

  else
    id_part = str
    desc    = ''
    match_EC(id_part)
    record = ['', [id_part], desc]
  end

  @deflines << record
  @entry_id ||= id_part
end
Shows description.
# File lib/bio/db/fasta/defline.rb, line 335
# Returns the last element of the first parsed defline record, i.e. its
# description, or nil when no defline has been added.
def description
  first_record = @deflines.first
  first_record.to_a.last
end
Returns descriptions.
# File lib/bio/db/fasta/defline.rb, line 340
# Returns the last element (the description) of every parsed defline
# record, in the order the deflines were added.
def descriptions
  @deflines.map(&:last)
end
Returns an identifier by database name.
# File lib/bio/db/fasta/defline.rb, line 416
# Returns the identifier stored under database name +dbname+
# (String or Symbol), or nil when there is none.
#
# Lookup order:
# 1. a value already present in @info;
# 2. the first entry of @list_ids whose leading element equals the name:
#    * with at most two elements, its last element is returned;
#    * otherwise the field is chosen through the NSIDs label table,
#      preferring acc_version, entry_id, locus, accession, number in
#      that order, and falling back to the first non-nil field.
# A value that is found is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  r = nil
  if di.size <= 2
    r = di[-1]
  else
    # The tag may be absent from NSIDs; without this guard labels.index
    # would raise NoMethodError on nil.
    labels = self.class::NSIDs[db]
    if labels
      %w[acc_version entry_id locus accession number].each do |type|
        i = labels.index(type)
        r = di[i + 1] if i
        break if r
      end
    end
    r ||= di[1..-1].find { |x| x }
  end

  @info[db] = r if r
  r
end
Returns identifiers by given type.
# File lib/bio/db/fasta/defline.rb, line 452
# Collects, over all entries of @list_ids, the fields whose NSIDs label
# is one of +type_strarg+. Entries whose leading element is not a key of
# NSIDs, and nil fields, are skipped. Results keep the order of
# @list_ids and, within one entry, the order of the given types.
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |id_list|
    labels = self.class::NSIDs[id_list[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      value = id_list[pos + 1]
      found << value if value
    end
  end
  found
end
Returns an identifier by given type.
# File lib/bio/db/fasta/defline.rb, line 440
# Returns the first non-nil field in @list_ids whose NSIDs label equals
# +type_str+, or nil when no entry provides one.
#
# An entry that has the label but an empty (nil) field is skipped so
# that a later entry can still supply the value; this matches
# get_all_by_type, which also ignores nil fields.
def get_by_type(type_str)
  @list_ids.each do |x|
    labels = self.class::NSIDs[x[0]]
    next unless labels

    i = labels.index(type_str)
    next unless i

    value = x[i + 1]
    return value if value
  end
  nil
end
Shows GI. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 481
# Memoized lookup of the ID labelled 'gi' via get_by_type.
# defined? is used instead of ||= so that a nil result is cached as well.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end
Shows ID-like strings. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 348
# Gathers ID-like strings:
# * from @list_ids entries with two or more elements, every non-nil
#   element after the leading one;
# * from single-element entries, the element itself when it consists
#   only of letters, digits, '.', '-' and '_';
# * from the case-sensitive word list, words that look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif !entry[0].to_s.empty? && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end
Shows locus. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 470
# Memoized lookup of the ID labelled 'locus' via get_by_type.
# defined? is used instead of ||= so that a nil result is cached as well.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end
# File lib/bio/db/fasta/defline.rb, line 521
# Lets a database name be used as a method, e.g. +defline.emb+ behaves
# like +defline.get('emb')+.
#
# Returns the value from get. A nil result is still returned (no error)
# when the name is a key of NSIDs. Any other name is passed to +super+,
# which raises NoMethodError.
#
# NoMethodError matters here: Ruby's implicit-conversion probes (to_ary,
# to_str, ...) rescue only NoMethodError, so raising a plain
# RuntimeError made calls such as +puts defline+ or +[defline].flatten+
# fail.
def method_missing(name, *args)
  # raise ArgumentError,
  # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
  r = get(name, *args)
  return r if r || self.class::NSIDs[name.to_s]

  super
end

# Keeps respond_to? in agreement with method_missing: true for NSIDs
# database names and for names that get resolves to a value.
def respond_to_missing?(name, include_private = false)
  return true if self.class::NSIDs[name.to_s] || get(name)

  super
end
Shows the original string. Note that the result of this method may differ from the original string that was given to the FastaDefline.new
method.
# File lib/bio/db/fasta/defline.rb, line 327
# Rebuilds a defline string from the parsed records.
# In each record the first element is the separator, the last element is
# the description, and the elements in between are ID lists. The ID
# lists are joined with the separator, followed by a space and the
# description, then stripped. Records are joined with Ctrl-A ("\x01").
def to_s
  rendered = @deflines.map do |record|
    sep = record[0]
    ids = record[1..-2].map { |id_list| id_list.join(sep) }.join(sep)
    (ids + ' ' + record[-1]).strip
  end
  rendered.join("\x01")
end
Shows words used in the defline. Returns an Array.
# File lib/bio/db/fasta/defline.rb, line 390
# Returns the sorted, de-duplicated words found in all descriptions.
#
# case_sensitive:: when false or nil (default) words are downcased
# kill_regexp::    words matching any of these regexps are dropped
# kwhash::         words whose downcased form is a truthy key here are
#                  dropped
#
# Words of one character or less are always dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\# \x00-\x1f\x7f]+/)
  kept = []
  tokens.each do |token|
    # Trim decoration characters from both ends of the token.
    token = token.sub(/\A[\$\*\-\+]+/, '')
    token = token.sub(/[\$\*\-\=]+\z/, '')
    next if token.size <= 1
    next if kwhash[token.downcase]
    next if kill_regexp.any? { |expr| expr =~ token }

    kept << token
  end
  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end