class Bio::UniProtKB
Description¶ ↑
Parser class for UniProtKB/SwissProt and TrEMBL
database entry.
See the UniProtKB
document files and manuals.
Examples¶ ↑
str = File.read("p53_human.swiss") obj = Bio::UniProtKB.new(str) obj.entry_id #=> "P53_HUMAN"
References
¶ ↑
-
The
UniProt
Knowledgebase (UniProtKB
) www.uniprot.org/help/uniprotkb -
The Universal Protein Resource (
UniProt
) uniprot.org/ -
The UniProtKB/SwissProt/TrEMBL User Manual www.uniprot.org/docs/userman.html
Public Instance Methods
returns contents in the CC lines.
-
Bio::UniProtKB#cc
-> Hash
returns an object of contents in the TOPIC.
-
Bio::UniProtKB#cc(TOPIC)
-> Array w/in Hash, Hash
returns contents of the “ALTERNATIVE PRODUCTS”.
-
Bio::UniProtKB#cc
(‘ALTERNATIVE PRODUCTS’) -> Hash{'Event' => str, 'Named isoforms' => int, 'Comment' => str, 'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]} CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative splicing; Named isoforms=15; ... CC placentae isoforms. All tissues differentially splice exon 13; CC Name=A; Synonyms=no del; CC IsoId=P15529-1; Sequence=Displayed;
returns contents of the “DATABASE”.
-
Bio::UniProtKB#cc
(‘DATABASE’) -> Array[{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...] CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
returns contents of the “MASS SPECTROMETRY”.
-
Bio::UniProtKB#cc
(‘MASS SPECTROMETRY’) -> Array[{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...] CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
CC lines (>=0, optional)¶ ↑
CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT CC IN LIVER, KIDNEY, LUNG AND BRAIN. CC -!- TOPIC: FIRST LINE OF A COMMENT BLOCK; CC SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.
See also www.expasy.org/sprot/userman.html#CC_line
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the contents of the CC (comment) lines.
#
# On first use the CC block is split on the "-!- " marker into a Hash that
# maps each topic name to an Array of comment bodies; that Hash is cached in
# @data['CC'].
#
#   cc         -> the whole topic Hash
#   cc(topic)  -> the value for one topic
#
# Per-topic post-processing:
# * 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION', 'FUNCTION' and
#   'INDUCTION' are joined into one String (nil when the topic is absent).
# * Topics with a dedicated cc_* helper are handed to that helper.
# * 'DATABASE' is split into Hashes with 'NAME', 'NOTE', 'WWW' and 'FTP'
#   keys (nil when the topic is absent).
# * Every other topic returns the raw Array (nil when absent).
#
# When the entry has no CC lines, or nothing but the copyright statement,
# an empty Hash is returned whatever +topic+ is, and nothing is cached.
#
# Raises RuntimeError when a comment block does not match "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    cc = {}
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return cc if get('CC').size == 0

    # Removing the copyright statement.  Work on a copy so the String handed
    # back by fetch is never modified in place.
    cc_raw = fetch('CC').sub(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return cc if cc_raw == ''

    begin
      # Keep only the text in front of the first border line.
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      # Drop the leading marker so the split below yields no empty block.
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key = $1
          # Re-join words hyphenated across a line wrap ("foo- bar"), but
          # leave "- AND" alone.
          body = $2.gsub(/- (?!AND)/, '-').strip
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this single clause
      # handles both.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  topics = @data['CC']

  case topic
  when nil
    topics
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(topics[topic])
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(topics[topic])
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the key is misspelled, so the real topic
    # 'CATALYTIC ACTIVITY' never reaches this branch and is returned as a
    # raw Array by the else clause.  Confirm what cc_catalytic_activity
    # does before correcting the spelling.
    cc_catalytic_activity(topics[topic])
  when 'CAUTION'
    cc_caution(topics[topic])
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    # An absent topic yields nil instead of failing on nil.join.
    lines = topics[topic]
    lines && lines.join('')
  when 'INTERACTION'
    cc_interaction(topics[topic])
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(topics[topic])
  when 'PATHWAY'
    cc_pathway(topics[topic])
  when 'RNA EDITING'
    cc_rna_editing(topics[topic])
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(topics[topic])
  when 'WEB RESOURCE'
    cc_web_resource(topics[topic])
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries = topics['DATABASE']
    return entries unless entries

    entries.map do |entry|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Strip the final character (the closing period) before splitting.
      entry.sub(/.$/, '').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS,
    # PHARMACEUTICAL, POLYMORPHISM, PTM, SIMILARITY, SUBUNIT,
    # TISSUE SPECIFICITY, TOXIC DOSE and any unlisted topic.
    topics[topic]
  end
end
Returns an Array (for new format since rel 14) or a String (for old format before rel 14) for the DE line.
Bio::EMBLDB::Common#de
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the parsed DE line, cached in @data['DE'].
#
# The raw DE text is run through parse_DE_line_rel14.  An Array result (the
# format used since rel 14) is stored as is; for anything else the work is
# delegated to the superclass implementation (presumably the old-format
# String parser, which is expected to fill @data['DE'] -- verify).
def de
  cached = @data['DE']
  return cached if cached

  parsed = parse_DE_line_rel14(get('DE'))
  if parsed.is_a?(Array)
    # new format since rel14
    @data['DE'] ||= parsed
  else
    super
  end
  @data['DE']
end
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns database cross-references taken from embl_dr.
#
#   dr       -> whatever embl_dr returns
#   dr(key)  -> Array of Hashes built from the rows stored under +key+
#               (an empty Array when there are none).  Each Hash has the
#               keys 'Accession', 'Version', ' ' and 'Molecular Type',
#               filled from row elements 0..3.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |fields|
    {
      'Accession'      => fields[0],
      'Version'        => fields[1],
      ' '              => fields[2],
      'Molecular Type' => fields[3]
    }
  end
end
returns a Hash of information in the DT lines.
hash keys: ['created', 'sequence', 'annotation']
Since UniProtKB
release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.
returns a String of information in the DT lines by a given key.
DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.) DT DD-MMM-YYYY (sequence version NN) DT DD-MMM-YYYY (entry version NN)
The format has been changed in UniProtKB
release 7.0 of 07-Feb-2006. Below is the older format.
Old format of DT Line; date (3/entry)¶ ↑
DT DD-MMM-YYYY (rel. NN, Created) DT DD-MMM-YYYY (rel. NN, Last sequence update) DT DD-MMM-YYYY (rel. NN, Last annotation update)
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the DT lines as a Hash with the keys 'created', 'sequence' and
# 'annotation' (first, second and third DT line), cached in @data['DT'].
# With +key+, returns only that entry of the Hash.
#
# Each value is the line with its first "two word characters + space" run
# (the line tag) removed and surrounding whitespace stripped.
def dt(key = nil)
  return dt[key] if key
  return @data['DT'] if @data['DT']

  lines = get('DT').split(/\n/)
  fields = {}
  %w[created sequence annotation].each_with_index do |name, index|
    fields[name] = lines[index].sub(/\w{2} /, '').strip
  end
  @data['DT'] = fields
end
returns a ENTRY_NAME in the ID line.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the 'ENTRY_NAME' field of the ID line.
# Shortcut for id_line('ENTRY_NAME').
def entry_id
  id_line('ENTRY_NAME')
end
returns contents in the feature table.
Examples¶ ↑
sp = Bio::UniProtKB.new(entry) ft = sp.ft ft.class #=> Hash ft.keys.each do |feature_key| ft[feature_key].each do |feature| feature['From'] #=> '1' feature['To'] #=> '21' feature['Description'] #=> '' feature['FTId'] #=> '' feature['diff'] #=> [] feature['original'] #=> [feature_key, '1', '21', '', ''] end end
-
Bio::UniProtKB#ft
-> Hash{FEATURE_KEY => [{'From' => int, 'To' => int, 'Description' => aStr, 'FTId' => aStr, 'diff' => [original_residues, changed_residues], 'original' => aAry }],...}
returns an Array of the information about the feature_name in the feature table.
-
Bio::UniProtKB#ft(feature_name)
-> Array of Hash[{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]
FT Line; feature table data (>=0, optional)¶ ↑
Col Data item ----- ----------------- 1- 2 FT 6-13 Feature name 15-20 `FROM' endpoint 22-27 `TO' endpoint 35-75 Description (>=0 per key) ----- -----------------
Note: ‘FROM’ and ‘TO’ endpoints are allowed to use non-numerical characters including ‘<’, ‘>’ or ‘?’ (e.g. ‘<1’, ‘?42’).
See also www.expasy.org/sprot/userman.html#FT_line
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the parsed feature table, cached in @data['FT'].
# With +feature_key+, returns only the entry stored under that key.
#
# The parser is chosen by inspecting line pairs starting at indexes 0..10:
# when a location line is directly followed by a "/qualifier" line the
# table is handed to ft_2019_11_parser, otherwise to ft_legacy_parser.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  ftlines = get('FT').split("\n")

  location_line = /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/
  qualifier_line = /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/

  # Matching against a missing line (nil) simply fails, so short tables
  # are safe to probe.
  fmt_2019_11 = (0..10).any? { |i|
    location_line =~ ftlines[i] && qualifier_line =~ ftlines[i + 1]
  }

  @data['FT'] = fmt_2019_11 ? ft_2019_11_parser(ftlines) : ft_legacy_parser(ftlines)
end
returns a String of the first gene name in the GN line.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the first element of gene_names, or nil when gene_names
# returns nothing.
def gene_name
  names = gene_names
  names.first if names
end
returns a Array of gene names in the GN line.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the gene names from the parsed GN data.
#
# When the first parsed record is a Hash, the :name value of every record
# is collected into an Array.  Otherwise the first element of the parsed
# data is returned unchanged.
def gene_names
  gn # populates @data['GN'] when it has not been parsed yet
  records = @data['GN']
  head = records.first
  return head unless head.class == Hash

  records.collect { |record| record[:name] }
end
returns gene names in the GN line.
New UniProt/SwissProt format:
-
Bio::UniProtKB#gn
-> [ <gene record>* ]
where <gene record> is:
{ :name => '...', :synonyms => [ 's1', 's2', ... ], :loci => [ 'l1', 'l2', ... ], :orfs => [ 'o1', 'o2', ... ] }
Old format:
-
Bio::UniProtKB#gn
-> Array # AND -
Bio::UniProtKB#gn[0]
-> Array # OR
GN Line: Gene name(s) (>=0, optional)¶ ↑
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the parsed GN data, cached in @data['GN'].
#
# gn_uniprot_parser is used when the GN text contains any of "Name=",
# "ORFNames=", "OrderedLocusNames=" or "Synonyms="; gn_old_parser is used
# otherwise.
def gn
  @data['GN'] ||=
    case fetch('GN')
    when /Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/
      gn_uniprot_parser
    else
      gn_old_parser
    end
end
The HI line¶ ↑
Bio::UniProtKB#hi
#=> hash
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the parsed HI line as an Array of Hashes, cached in @data['HI'].
#
# The text is split on ". " into groups of the form
# "Category: kw1; kw2; ...; kwN".  Each group becomes
#
#   {'Category' => str, 'Keywords' => [kw1, ...], 'Keyword' => kwN}
#
# where 'Keyword' is the last keyword (trailing period removed) and
# 'Keywords' holds the ones before it.
#
# The cache is written only after every group has been parsed, so a
# malformed group (for example one without ": ", which raises
# NoMethodError) fails on every call instead of leaving an empty or
# partial result cached.
def hi
  return @data['HI'] if @data['HI']

  parsed = fetch('HI').split(/\. /).map { |hlist|
    category, keywords = hlist.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    {'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword}
  }
  @data['HI'] = parsed
end
returns a Hash of the ID line.
returns a content (Int or String) of the ID line by a given key. Hash keys: [‘ENTRY_NAME’, ‘DATA_CLASS’, ‘MOLECULE_TYPE’, ‘SEQUENCE_LENGTH’]
ID Line (since UniProtKB
release 9.0 of 31-Oct-2006)¶ ↑
ID P53_HUMAN Reviewed; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
ID Line (older style)¶ ↑
ID P53_HUMAN STANDARD; PRT; 393 AA. #"ID #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."
Examples¶ ↑
obj.id_line #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"} obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the ID line as a Hash with the keys 'ENTRY_NAME', 'DATA_CLASS',
# 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH', cached in @data['ID'].
# With +key+, returns only that entry of the Hash.
#
# The raw line is split on runs of spaces.  When the fifth field is "AA."
# the line is in the layout used since UniProtKB release 9.0 of 31-Oct-2006
# (http://www.uniprot.org/docs/sp_news.htm), which has no molecule type;
# otherwise the older layout with a molecule type field is assumed.
def id_line(key = nil)
  return id_line[key] if key
  return @data['ID'] if @data['ID']

  fields = @orig['ID'].split(/ +/)
  if fields[4].to_s.chomp == 'AA.'
    molecule_type = nil
    length_field = fields[3]
  else
    molecule_type = fields[3].sub(/;/, '')
    length_field = fields[4]
  end

  @data['ID'] = {
    'ENTRY_NAME' => fields[1],
    'DATA_CLASS' => fields[2].sub(/;/, ''),
    'MOLECULE_TYPE' => molecule_type,
    'SEQUENCE_LENGTH' => length_field.to_i
  }
end
returns a MOLECULE_TYPE in the ID line.
A short-cut for Bio::UniProtKB#id_line
(‘MOLECULE_TYPE’).
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the 'MOLECULE_TYPE' field of the ID line.
# Shortcut for id_line('MOLECULE_TYPE').
def molecule
  id_line('MOLECULE_TYPE')
end
The OH Line; ¶ ↑
OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the parsed OH line as an Array of Hashes, cached in @data['OH']:
#
#   [{'NCBI_TaxID' => str, 'HostName' => str or nil}, ...]
#
# The text is split on ". "; each piece must contain "NCBI_TaxID=<digits>;".
# The host name is whatever follows "; " (trailing period removed), or nil
# when nothing follows.
#
# Raises ArgumentError when a piece carries no NCBI_TaxID.
def oh
  return @data['OH'] if @data['OH']

  hosts = fetch('OH').split("\. ").map do |piece|
    taxid = piece[/NCBI_TaxID=(\d+);/, 1]
    unless taxid
      raise ArgumentError, ["Error: Invalid OH line format (#{entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end

    host_name = piece[/NCBI_TaxID=\d+; (.+)/, 1]
    host_name.sub!(/\.$/, '') if host_name

    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  end
  @data['OH'] = hosts
end
returns an Array of Hashes, or a String of the OS line when an index is given.
-
Bio::EMBLDB#os -> Array
[{'name' => '(Human)', 'os' => 'Homo sapiens'}, {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
-
Bio::EPTR#os -> Hash
{'name' => "(Human)", 'os' => 'Homo sapiens'}
-
Bio::UniProtKB#os[0]
[‘name’] -> “(Human)” -
Bio::EPTR#os(0) -> “Homo sapiens (Human)”
OS Line; organism species (>=1)¶ ↑
OS Genus species (name). OS Genus species (name0) (name1). OS Genus species (name0) (name1). OS Genus species (name0), G s0 (name0), and G s (name0) (name1). OS Homo sapiens (Human), and Rattus norvegicus (Rat) OS Hippotis sp. Clark and Watts 825. OS unknown cyperaceous sp.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the organisms named in the OS line, cached in @data['OS'].
#
#   os       -> Array of {'name' => "(Common name)" or nil, 'os' => species}
#   os(num)  -> "species (Common name)" String for the organism at index num
#
# The text is split on ", and" / ", "; in each piece the species is the
# first run matched by the organism pattern and the name is the
# parenthesised part, if any.
#
# Raises RuntimeError when a piece contains no recognisable species.
def os(num = nil)
  unless @data['OS']
    @data['OS'] = fetch('OS').split(/, and|, /).map { |piece|
      species = piece[/(\w+ *[\w \:\'\+\-\.]+[\w\.])/, 1]
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n" unless species

      {'name' => piece[/(\(.+\))/, 1], 'os' => species}
    }
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  "#{@data['OS'][num]['os']} #{@data['OS'][num]['name']}"
end
returns a Hash of organism taxonomy cross-references.
-
Bio::UniProtKB#ox
-> Hash{'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}
OX Line; organism taxonomy cross-reference (>=1 per entry)¶ ↑
OX NCBI_TaxID=1234; OX NCBI_TaxID=1234, 2345, 3456, 4567;
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the OX line as a Hash mapping each database name to an Array of
# identifier Strings, cached in @data['OX'].
#
# The trailing period is dropped, the text is split on ";", and each
# "db=id1, id2, ..." piece is split on "=" and then on commas.
def ox
  @data['OX'] ||= fetch('OX').sub(/\.$/, '').split(/;/).each_with_object({}) { |piece, xrefs|
    db, refs = piece.strip.split(/=/)
    xrefs[db] = refs.split(/, */)
  }
end
returns the proposed official name of the protein. Returns a String.
Since UniProtKB
release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name which is taken from “RecName: Full=” or “SubName: Full=” line normally in the beginning of the DE lines. Unlike parser for old format, no special treatments for fragment or precursor.
For old format, the method parses the DE lines and returns the protein name as a String.
DE Line; description (>=1)¶ ↑
"DE #{OFFICIAL_NAME} (#{SYNONYM})" "DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]." OFFICIAL_NAME 1/entry SYNONYM >=0 CONTAINS >=0
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the protein name as a String.
#
# When de returns an Array (format since UniProtKB release 14.0 of
# 22-Jul-2008), the value of the first 'Full' pair found in a 'RecName' or
# 'SubName' record is returned, or an empty String when there is none.
#
# Otherwise (old format, before Rel. 13.x) the name is the DE text in front
# of the first "[" and the first "(", stripped, with " (Fragment)" appended
# when that text mentions "fragment" (case-insensitive).
def protein_name
  parsed_de_line = de

  unless parsed_de_line.kind_of?(Array)
    # old format (before Rel. 13.x)
    name = ""
    de_line = fetch('DE')
    if de_line
      head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
      name = head[/^[^(]*/].strip
      name << ' (Fragment)' if head =~ /fragment/i
    end
    return name
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  full_pair = nil
  parsed_de_line.each do |record|
    next unless ['RecName', 'SubName'].include?(record[0])

    full_pair = record[1..-1].find { |pair| pair[0] == 'Full' }
    break if full_pair
  end
  (full_pair && full_pair[1]).to_s
end
returns contents in the R lines.
-
Bio::EMBLDB::Common#ref
-> [ <reference information Hash>* ]
where <reference information Hash> is:
{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
R Lines
-
RN RC RP RX RA RT RL RG
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the reference (R*) lines as an Array of Hashes, one per
# reference, cached in @data['R'].
#
# The raw text is split into references at each "\nRN ".  Within a
# reference the text of every RN/RC/RP/RX/RA/RT/RL/RG line is appended
# (followed by a space) to the slot for its tag, and each slot is then
# passed through the matching set_XX method.
#
# Raises RuntimeError on a line that is not one of the R* tags.
def ref
  return @data['R'] if @data['R']

  tags = %w[RN RC RP RX RA RT RL RG]

  records = [get('R').split(/\nRN /)].flatten.map do |str|
    hash = {}
    tags.each { |tag| hash[tag] = '' }

    # The split removes the "RN " prefix from every reference but the first.
    str = 'RN ' + str unless /^RN / =~ str

    str.split("\n").each do |line|
      unless /^(R[NPXARLCTG]) (.+)/ =~ line
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      hash[$1] += $2 + ' '
    end

    hash['RN'] = set_RN(hash['RN'])
    hash['RC'] = set_RC(hash['RC'])
    hash['RP'] = set_RP(hash['RP'])
    hash['RX'] = set_RX(hash['RX'])
    hash['RA'] = set_RA(hash['RA'])
    hash['RT'] = set_RT(hash['RT'])
    hash['RL'] = set_RL(hash['RL'])
    hash['RG'] = set_RG(hash['RG'])

    hash
  end

  @data['R'] = records
end
returns Bio::Reference
object from Bio::EMBLDB::Common#ref
.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns a References object built from ref, cached in
# @data['references'].
#
# Each reference Hash from ref is mapped onto a Reference:
# * 'RA' -> 'authors' (split on ", ")
# * 'RT' -> 'title'
# * 'RL' -> 'journal', 'volume', 'issue', 'pages', 'year' when the value
#   looks like "Journal VOL (ISSUE), FROM-TO (YEAR)"; otherwise the whole
#   value goes into 'journal'
# * 'RX' -> one entry per (tag, xref) pair, keyed by the downcased tag
def references
  return @data['references'] if @data['references']

  ary = ref.map do |ent|
    hash = Hash.new('')
    ent.each do |key, value|
      case key
      when 'RA'
        hash['authors'] = value.split(/, /)
      when 'RT'
        hash['title'] = value
      when 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          hash['journal'], hash['volume'], hash['issue'],
            hash['pages'], hash['year'] = $~.captures
        else
          hash['journal'] = value
        end
      when 'RX' # PUBMED, MEDLINE, DOI
        value.each { |tag, xref| hash[tag.downcase] = xref }
      end
    end
    Reference.new(hash)
  end

  @data['references'] = References.new(ary)
end
returns a Bio::Sequence::AA
of the amino acid sequence.
blank Line; sequence data (>=1)
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the sequence as a Sequence::AA, cached in @data[''].
# The sequence text is fetched under the empty tag and all spaces and
# digit runs are removed before the object is built.
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
end
returns a SEQUENCE_LENGTH in the ID line.
A short-cut for Bio::UniProtKB#id_line
(‘SEQUENCE_LENGTH’).
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the 'SEQUENCE_LENGTH' field of the ID line.
# Shortcut for id_line('SEQUENCE_LENGTH').
def sequence_length
  id_line('SEQUENCE_LENGTH')
end
# File lib/bio/db/embl/uniprotkb.rb
#
# Normalises the accumulated RN text: returns +data+ with leading and
# trailing whitespace removed.
def set_RN(data)
  data.strip
end
returns a Hash of contents in the SQ lines.
-
Bio::UniProtKB#sq -> hsh
returns a value of a key given in the SQ lines.
-
Bio::UniProtKB#sq(key) -> int or str
-
Keys: [‘MW’, ‘mw’, ‘molecular’, ‘weight’, ‘aalen’, ‘len’, ‘length’,
'CRC64']
SQ Line; sequence header (1/entry)¶ ↑
SQ SEQUENCE 233 AA; 25630 MW; 146A1B48A1475C86 CRC64; SQ SEQUENCE \d+ AA; \d+ MW; [0-9A-Z]+ CRC64;
MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the SQ line as a Hash with the keys 'aalen' (Integer),
# 'MW' (Integer) and 'CRC64' (String), cached in @data['SQ'].
#
# With +key+, returns one value: keys matching /mw/, /molecular/ or
# /weight/ give the molecular weight; keys matching /len/, /length/ or /AA/
# give the sequence length; any other key is looked up in the Hash as is.
#
# Raises RuntimeError when the SQ line does not have the expected layout.
def sq(key = nil)
  unless @data['SQ']
    matched = fetch('SQ') =~ /(\d+) AA\; (\d+) MW; (.+) CRC64;/
    raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless matched

    @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
  end

  header = @data['SQ']
  return header unless key

  case key
  when /mw/, /molecular/, /weight/
    header['MW']
  when /len/, /length/, /AA/
    header['aalen']
  else
    header[key]
  end
end
returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.
Since UniProtKB
release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. For keeping compatibility with old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.
For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.
# File lib/bio/db/embl/uniprotkb.rb
#
# Returns the synonyms (alternative names) as an Array of Strings.
#
# When de returns an Array (format since UniProtKB release 14.0 of
# 22-Jul-2008), every name in the 'RecName', 'SubName' and 'AltName'
# records that differs from protein_name is collected, stopping at the
# first 'Includes' or 'Contains' record.  'EC' values are reported as
# "EC N.N.N.N"; 'Allergen' and 'CD_antigen' values keep their field name as
# a "Field=" prefix.
#
# Otherwise (old format, before Rel. 13.x) the synonyms are the
# parenthesised parts of the DE text outside "[...]", skipping any that
# mention "fragment" (case-insensitive).
def synonyms
  ary = []
  parsed_de_line = de

  if parsed_de_line.kind_of?(Array)
    # since UniProtKB release 14.0 of 22-Jul-2008
    #
    # protein_name re-scans the parsed DE data on every call, so it is
    # looked up once, and only when there is a name to compare it with.
    official_name = nil

    parsed_de_line.each do |record|
      case record[0]
      when 'Includes', 'Contains'
        break # the each loop
      when 'RecName', 'SubName', 'AltName'
        record[1..-1].each do |pair|
          next unless pair[1]

          official_name ||= protein_name
          next if pair[1] == official_name

          ary.push(
            case pair[0]
            when 'EC'
              "EC " + pair[1]
            when 'Allergen', 'CD_antigen'
              pair[0] + '=' + pair[1]
            else
              pair[1]
            end
          )
        end
      end
    end
  else
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    if de_line
      # ignore stuff between [ and ]. That's the "contains" part
      line = de_line.sub(/\[.*\]/, '')
      line.scan(/\([^)]+/) do |synonym|
        # index to remove the leading (
        ary << synonym[1..-1].strip unless synonym =~ /fragment/i
      end
    end
  end

  ary
end