class Bio::UniProtKB

Description

Parser class for UniProtKB/SwissProt and TrEMBL database entry.

See the UniProtKB document files and manuals.

Examples

str = File.read("p53_human.swiss")
obj = Bio::UniProtKB.new(str)
obj.entry_id #=> "P53_HUMAN"

References

Public Instance Methods

aalen()
Alias for: sequence_length
aaseq()
Alias for: seq
cc(topic = nil) click to toggle source

returns contents in the CC lines.

returns an object of contents in the TOPIC.

returns contents of the “ALTERNATIVE PRODUCTS”.

  • Bio::UniProtKB#cc(‘ALTERNATIVE PRODUCTS’) -> Hash

    {'Event' => str, 
     'Named isoforms' => int,  
     'Comment' => str,
     'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
    
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=15;
    ...
    CC         placentae isoforms. All tissues differentially splice exon 13;
    CC       Name=A; Synonyms=no del;
    CC         IsoId=P15529-1; Sequence=Displayed;

returns contents of the “DATABASE”.

  • Bio::UniProtKB#cc(‘DATABASE’) -> Array

    [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
    
    CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].

returns contents of the “MASS SPECTROMETRY”.

  • Bio::UniProtKB#cc(‘MASS SPECTROMETRY’) -> Array

    [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
    
    CC   -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].

CC lines (>=0, optional)

CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
CC       IN LIVER, KIDNEY, LUNG AND BRAIN.

CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.

See also www.expasy.org/sprot/userman.html#CC_line

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the CC (comment) lines.
#
# On first use the CC field is split on the "-!- " delimiter into a Hash
# mapping each TOPIC to an Array of comment bodies; that Hash is cached in
# @data['CC'].
#
# topic:: nil (default) for the whole Hash, otherwise a topic name such as
#         'FUNCTION' or 'DATABASE'.
#
# Return value by topic:
# * nil -> the cached Hash of TOPIC => [body, ...]
# * 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION', 'FUNCTION',
#   'INDUCTION' -> the bodies joined into one String, or nil when the
#   entry has no such topic
# * 'DATABASE' -> Array of {'NAME', 'NOTE', 'WWW', 'FTP'} Hashes, or nil
# * topics handled by a cc_* helper -> whatever that helper returns
# * any other topic -> the Array of bodies, or nil
#
# When the entry has no CC lines (or only the copyright notice) an empty
# Hash is returned whatever the topic; this historical behaviour is kept.
#
# Raises RuntimeError when a comment block does not look like "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    comments = {}
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # 12KD_MYCSM has no CC lines.
    return comments if get('CC').size == 0

    # Removing the copyright statement.
    cc_raw = fetch('CC').sub(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return comments if cc_raw == ''

    begin
      # Keep only the text in front of a dashed border line, if one is left.
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw.sub(dlm, '').split(dlm).each do |chunk|
        chunk = chunk.strip

        unless /(^[A-Z ]+[A-Z]): (.+)/ =~ chunk
          raise ["Error: [#{entry_id}]: CC Lines", '"', chunk, '"',
                 '', get('CC'), ''].join("\n")
        end

        key  = $1
        # Re-join words that were hyphenated across a line break.
        body = $2.gsub(/- (?!AND)/, '-').strip
        (comments[key] ||= []).push(body)
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this single clause also
      # covers calls on nil (e.g. when the split above yields nothing).
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = comments
  end

  comments = @data['CC']

  case topic
  when nil
    comments
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(comments[topic])
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(comments[topic])
  when 'CATALITIC ACTIVITY'
    # NOTE(review): spelled "CATALITIC" here, so 'CATALYTIC ACTIVITY' falls
    # through to the generic branch below and returns the raw Array.
    # Left as is because changing it would alter what callers get back.
    cc_catalytic_activity(comments[topic])
  when 'CAUTION'
    cc_caution(comments[topic])
  when 'INTERACTION'
    cc_interaction(comments[topic])
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(comments[topic])
  when 'PATHWAY'
    cc_pathway(comments[topic])
  when 'RNA EDITING'
    cc_rna_editing(comments[topic])
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(comments[topic])
  when 'WEB RESOURCE'
    cc_web_resource(comments[topic])
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    # These topics are returned as one String; an absent topic gives nil
    # instead of failing on nil.join.
    bodies = comments[topic]
    bodies ? bodies.join('') : nil
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries = comments['DATABASE']
    return entries unless entries

    entries.map do |entry|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop the final character (the closing ".") before splitting fields.
      entry.sub(/.$/, '').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS,
    # PHARMACEUTICAL, POLYMORPHISM, PTM, SIMILARITY, SUBUNIT,
    # TISSUE SPECIFICITY, TOXIC DOSE and unknown topics: raw Array or nil.
    comments[topic]
  end
end
de() click to toggle source

Returns an Array (for new format since rel 14) or a String (for old format before rel 14) for the DE line.

Calls superclass method Bio::EMBLDB::Common#de
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed DE line, cached in @data['DE'].
#
# The raw DE field is first handed to parse_DE_line_rel14. When that yields
# an Array (new format since rel 14) the Array is cached and returned;
# otherwise the superclass implementation is called and whatever is then
# stored in @data['DE'] is returned.
def de
  cached = @data['DE']
  return cached if cached

  parsed = parse_DE_line_rel14(get('DE'))
  if Array === parsed
    # new format since rel14
    @data['DE'] ||= parsed
  else
    super
  end
  @data['DE']
end
dr(key = nil) click to toggle source

Bio::UniProtKB#dr

     # File lib/bio/db/embl/uniprotkb.rb
# Returns database cross-references from the DR lines.
#
# key:: nil (default) to get everything embl_dr returns, or a database name.
#
# With a key, each row stored under that key in embl_dr is turned into a
# Hash with the keys 'Accession', 'Version', ' ' and 'Molecular Type'
# (taken from row positions 0 to 3). An unknown key gives an empty Array.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |row|
    {'Accession' => row[0],
     'Version' => row[1],
     ' ' => row[2],
     'Molecular Type' => row[3]}
  end
end
Also aliased as: embl_dr
dt(key = nil) click to toggle source

returns a Hash of information in the DT lines.

hash keys: 
  ['created', 'sequence', 'annotation']

Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.

returns a String of information in the DT lines by a given key.

DT Line; date (3/entry)

DT DD-MMM-YYY (integrated into UniProtKB/XXXXX.)
DT DD-MMM-YYY (sequence version NN)
DT DD-MMM-YYY (entry version NN)

The format was changed in UniProtKB release 7.0 of 07-Feb-2006. Below is the older format.

Old format of DT Line; date (3/entry)

DT DD-MMM-YYY (rel. NN, Created)
DT DD-MMM-YYY (rel. NN, Last sequence update)
DT DD-MMM-YYY (rel. NN, Last annotation update)
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the dates found in the DT lines.
#
# key:: nil (default) for the whole Hash, or one of 'created', 'sequence',
#       'annotation' to get that single String.
#
# The first three DT lines are used, in that order; the leading two-letter
# tag and its three spaces are removed from each. The Hash is cached in
# @data['DT'].
def dt(key = nil)
  dates = @data['DT']
  unless dates
    untag = lambda { |line| line.sub(/\w{2}   /, '').strip }
    lines = get('DT').split(/\n/)
    dates = @data['DT'] = {
      'created'    => untag.call(lines[0]),
      'sequence'   => untag.call(lines[1]),
      'annotation' => untag.call(lines[2])
    }
  end
  key ? dates[key] : dates
end
embl_dr(key = nil)

Backup Bio::EMBLDB#dr as embl_dr

Alias for: dr
entry()
Alias for: entry_id
entry_id() click to toggle source

returns a ENTRY_NAME in the ID line.

    # File lib/bio/db/embl/uniprotkb.rb
 # Returns the 'ENTRY_NAME' value of the ID line, as given by id_line.
 def entry_id
   field = 'ENTRY_NAME'
   id_line(field)
 end
Also aliased as: entry_name, entry
entry_name()
Alias for: entry_id
ft(feature_key = nil) click to toggle source

returns contents in the feature table.

Examples

sp = Bio::UniProtKB.new(entry)
ft = sp.ft
ft.class #=> Hash
ft.keys.each do |feature_key|
  ft[feature_key].each do |feature|
    feature['From'] #=> '1'
    feature['To']   #=> '21'
    feature['Description'] #=> ''
    feature['FTId'] #=> ''
    feature['diff'] #=> []
    feature['original'] #=> [feature_key, '1', '21', '', '']
  end
end
  • Bio::UniProtKB#ft -> Hash

    {FEATURE_KEY => [{'From' => int, 'To' => int, 
                      'Description' => aStr, 'FTId' => aStr,
                      'diff' => [original_residues, changed_residues],
                      'original' => aAry }],...}

returns an Array of the information about the feature_name in the feature table.

FT Line; feature table data (>=0, optional)

Col     Data item
-----   -----------------
 1- 2   FT
 6-13   Feature name 
15-20   `FROM' endpoint
22-27   `TO' endpoint
35-75   Description (>=0 per key)
-----   -----------------

Note: ‘FROM’ and ‘TO’ endpoints are allowed to contain non-numerical characters, including ‘<’, ‘>’ or ‘?’ (e.g. ‘<1’, ‘?42’).

See also www.expasy.org/sprot/userman.html#FT_line

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the feature table (FT lines).
#
# feature_key:: nil (default) for the whole parsed table, or a feature key
#               to get only the value stored under it.
#
# The first lines of the FT field (indexes 0 to 10) are probed for a
# location line immediately followed by a "/qualifier" line. If such a pair
# is found the lines are given to ft_2019_11_parser, otherwise to
# ft_legacy_parser. The parser's result is cached in @data['FT'].
def ft(feature_key = nil)
  table = @data['FT']
  unless table
    lines = get('FT').split("\n")
    location_line  = /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/
    qualifier_line = /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/

    fmt_2019_11 = (0..10).any? do |i|
      location_line =~ lines[i] && qualifier_line =~ lines[i + 1]
    end

    table = @data['FT'] =
      fmt_2019_11 ? ft_2019_11_parser(lines) : ft_legacy_parser(lines)
  end
  feature_key ? table[feature_key] : table
end
gene_name() click to toggle source

returns a String of the first gene name in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the first element of gene_names, or nil when gene_names itself
# is nil (or false).
def gene_name
  names = gene_names
  return nil unless names

  names.first
end
gene_names() click to toggle source

returns a Array of gene names in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the gene names from the GN line.
#
# When the first element of the parsed GN data is a Hash, the :name value
# of every element is collected into an Array. Otherwise the first element
# of the parsed data is returned as is.
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  return records.first unless records.first.instance_of?(Hash)

  records.map { |record| record[:name] }
end
gn() click to toggle source

returns gene names in the GN line.

New UniProt/SwissProt format:

where <gene record> is:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}

Old format:

GN Line: Gene name(s) (>=0, optional)

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed GN (gene name) lines, cached in @data['GN'].
#
# If the GN field contains any of "Name=", "ORFNames=",
# "OrderedLocusNames=" or "Synonyms=", the result of gn_uniprot_parser is
# used; otherwise the result of gn_old_parser.
def gn
  unless @data['GN']
    field = fetch('GN')
    markers = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
    new_style = markers.any? { |marker| marker === field }
    @data['GN'] = new_style ? gn_uniprot_parser : gn_old_parser
  end
  @data['GN']
end
hi() click to toggle source

The HI line

Bio::UniProtKB#hi #=> Array of Hash ({'Category' => str, 'Keywords' => [str, ...], 'Keyword' => str})

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed HI line as an Array of Hashes, cached in @data['HI'].
#
# The HI field is split on ". "; each piece is split once more on ": "
# into a category and a "; "-separated keyword list. Every Hash holds:
# 'Category':: the text in front of ": "
# 'Keywords':: all keywords except the last one
# 'Keyword'::  the last keyword, without a trailing "."
def hi
  return @data['HI'] if @data['HI']

  # The cache is installed before parsing and filled as pieces are parsed.
  records = @data['HI'] = []
  fetch('HI').split(/\. /).each do |piece|
    category, keywords = piece.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    records << {'Category' => category, 'Keywords' => keywords, 'Keyword' => keyword}
  end
  @data['HI']
end
id_line(key = nil) click to toggle source

returns a Hash of the ID line.

returns a content (Int or String) of the ID line by a given key. Hash keys: [‘ENTRY_NAME’, ‘DATA_CLASS’, ‘MOLECULE_TYPE’, ‘SEQUENCE_LENGTH’]

ID Line (since UniProtKB release 9.0 of 31-Oct-2006)

ID   P53_HUMAN               Reviewed;         393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"

ID Line (older style)

ID   P53_HUMAN      STANDARD;      PRT;   393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
   # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed ID line.
#
# key:: nil (default) for the whole Hash, or one of 'ENTRY_NAME',
#       'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH'.
#
# The raw ID line (@orig['ID']) is split on runs of spaces. When the fifth
# token is "AA." the line is in the newer layout without a molecule type
# (after UniProtKB release 9.0 of 31-Oct-2006,
# http://www.uniprot.org/docs/sp_news.htm) and 'MOLECULE_TYPE' is nil;
# otherwise the fourth token is the molecule type and the fifth the length.
# The Hash is cached in @data['ID'].
def id_line(key = nil)
  fields = @data['ID']
  unless fields
    part = @orig['ID'].split(/ +/)
    molecule_type, sequence_length =
      if part[4].to_s.chomp == 'AA.'
        [nil, part[3].to_i]
      else
        [part[3].sub(/;/, ''), part[4].to_i]
      end
    fields = @data['ID'] = {
      'ENTRY_NAME'      => part[1],
      'DATA_CLASS'      => part[2].sub(/;/, ''),
      'MOLECULE_TYPE'   => molecule_type,
      'SEQUENCE_LENGTH' => sequence_length
    }
  end
  key ? fields[key] : fields
end
molecule() click to toggle source

returns a MOLECULE_TYPE in the ID line.

A short-cut for Bio::UniProtKB#id_line(‘MOLECULE_TYPE’).

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the 'MOLECULE_TYPE' value of the ID line, as given by id_line.
def molecule
  field = 'MOLECULE_TYPE'
  id_line(field)
end
Also aliased as: molecule_type
molecule_type()
Alias for: molecule
oh() click to toggle source

The OH Line;

OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed OH (organism host) line as an Array of
# {'NCBI_TaxID' => str, 'HostName' => str or nil} Hashes, cached in
# @data['OH'].
#
# The field is split around every "NCBI_TaxID=<digits>;" occurrence; the
# text following each occurrence (minus a trailing ".") is the host name
# belonging to that tax id.
#
# Raises ArgumentError when the line is malformed. The message names the
# reason (missing_semicolon, missing_taxid or missing_taxid_last).
def oh
  unless @data['OH']
    hosts = []
    tokens = fetch('OH').split(/(NCBI\_TaxID\=)(\d+)(\;)/)
    # catch returns the thrown reason on a format error, nil on success.
    problem = catch :error do
      taxid = nil
      host_name = nil
      while (token = tokens.shift)
        token = token.to_s.strip
        case token
        when ''
          next
        when 'NCBI_TaxID='
          # A new tax id starts: flush the record collected so far.
          if taxid
            hosts.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
            host_name = nil
          end
          taxid = tokens.shift
          throw :error, :missing_semicolon if tokens.shift != ';'
        else
          throw :error, :missing_taxid if host_name
          host_name = token.sub(/\.\z/, '')
        end
      end
      if taxid
        hosts.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
      elsif host_name
        throw :error, :missing_taxid_last
      end
      nil
    end
    if problem
      # Report the thrown reason; $! is not set here because nothing was
      # raised on the way.
      raise ArgumentError,
            ["Error: Invalid OH line format (#{entry_id}): ",
             problem, "\n", get('OH'), "\n"].join
    end
    @data['OH'] = hosts
  end
  @data['OH']
end
os(num = nil) click to toggle source

returns an Array of Hashes for the OS line, or a String ("os name") for the entry at index num when num is given.

  • Bio::EMBLDB#os -> Array

[{'name' => '(Human)', 'os' => 'Homo sapiens'}, 
 {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
{'name' => "(Human)", 'os' => 'Homo sapiens'}

OS Line; organism species (>=1)

OS   Genus species (name).
OS   Genus species (name0) (name1).
OS   Genus species (name0) (name1).
OS   Genus species (name0), G s0 (name0), and G s (name0) (name1).
OS   Homo sapiens (Human), and Rattus norvegicus (Rat)
OS   Hippotis sp. Clark and Watts 825.
OS   unknown cyperaceous sp.
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the organisms named in the OS line.
#
# num:: nil (default) for the whole Array, or an index into it.
#
# The OS field is split on ", and" / ", "; each piece yields a Hash
# {'name' => "(...)" part or nil, 'os' => organism name}. The Array is
# cached in @data['OS']. With num, the String "<os> <name>" of that element
# is returned, e.g. "Trifolium repens (white clover)".
#
# Raises RuntimeError when a piece contains no organism name.
def os(num = nil)
  unless @data['OS']
    @data['OS'] = fetch('OS').split(/, and|, /).map do |piece|
      organism = piece[/(\w+ *[\w \:\'\+\-\.]+[\w\.])/, 1]
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n" unless organism

      {'name' => piece[/(\(.+\))/, 1], 'os' => organism}
    end
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  chosen = @data['OS'][num]
  "#{chosen['os']} #{chosen['name']}"
end
ox() click to toggle source

returns a Hash of organism taxonomy cross-references.

OX Line; organism taxonomy cross-reference (>=1 per entry)

OX   NCBI_TaxID=1234;
OX   NCBI_TaxID=1234, 2345, 3456, 4567;
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the organism taxonomy cross-references of the OX line as a Hash
# of database name => Array of identifiers, cached in @data['OX'].
#
# A trailing "." is removed, the field is split on ";", and each piece is
# split on "=" into the database name and a comma-separated id list.
def ox
  unless @data['OX']
    pieces = fetch('OX').sub(/\.$/, '').split(/;/).map(&:strip)
    @data['OX'] = pieces.each_with_object({}) do |piece, xrefs|
      db, refs = piece.split(/=/)
      xrefs[db] = refs.split(/, */)
    end
  end
  return @data['OX']
end
protein_name() click to toggle source

returns the proposed official name of the protein. Returns a String.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have been changed. The method returns the full name which is taken from “RecName: Full=” or “SubName: Full=” line normally in the beginning of the DE lines. Unlike parser for old format, no special treatments for fragment or precursor.

For old format, the method parses the DE lines and returns the protein name as a String.

DE Line; description (>=1)

"DE #{OFFICIAL_NAME} (#{SYNONYM})"
"DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
OFFICIAL_NAME  1/entry
SYNONYM        >=0
CONTAINS       >=0
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the protein name as a String.
#
# New format (de returns an Array, since UniProtKB release 14.0 of
# 22-Jul-2008): the 'Full' value of the first 'RecName' or 'SubName' entry
# that has one; an empty String when there is none.
#
# Old format (before Rel. 13.x): the DE text in front of the first "(",
# taken from the part preceding the first "[" (the "contains" part), with
# " (Fragment)" appended when that part mentions "fragment".
def protein_name
  parsed = de

  unless parsed.kind_of?(Array)
    # old format (before Rel. 13.x)
    name = ""
    if (de_line = fetch('DE'))
      head = de_line[/^[^\[]*/] # everything preceding the first [
      name = head[/^[^(]*/].strip
      name << ' (Fragment)' if head =~ /fragment/i
    end
    return name
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  full_pair = nil
  parsed.each do |entry|
    next unless %w[RecName SubName].include?(entry[0])

    full_pair = entry[1..-1].find { |pair| pair[0] == 'Full' }
    break if full_pair
  end
  (full_pair ? full_pair[1] : nil).to_s
end
ref() click to toggle source

returns contents in the R lines.

where <reference information Hash> is:

{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}

R Lines

  • RN RC RP RX RA RT RL RG

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the reference (R*) lines as an Array of Hashes, one per
# reference, cached in @data['R'].
#
# The R field is cut before every "RN   " line. Inside one reference the
# text of all lines sharing a tag (RN, RC, RP, RX, RA, RT, RL, RG) is
# concatenated, each line followed by a space, and the result is passed
# through the matching set_XX method.
#
# Raises RuntimeError for a line that does not start with one of the tags
# followed by three spaces and some text.
def ref
  unless @data['R']
    @data['R'] = get('R').split(/\nRN   /).map do |chunk|
      fields = {'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '',
                'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}
      # Splitting consumed the "RN   " tag of every reference but the first.
      chunk = 'RN   ' + chunk unless /^RN   / =~ chunk

      chunk.split("\n").each do |line|
        tagged = /^(R[NPXARLCTG])   (.+)/.match(line)
        raise "Invalid format in R lines, \n[#{line}]\n" unless tagged

        fields[tagged[1]] += tagged[2] + ' '
      end

      fields['RN'] = set_RN(fields['RN'])
      fields['RC'] = set_RC(fields['RC'])
      fields['RP'] = set_RP(fields['RP'])
      fields['RX'] = set_RX(fields['RX'])
      fields['RA'] = set_RA(fields['RA'])
      fields['RT'] = set_RT(fields['RT'])
      fields['RL'] = set_RL(fields['RL'])
      fields['RG'] = set_RG(fields['RG'])
      fields
    end
  end
  @data['R']
end
references() click to toggle source

returns Bio::Reference object from Bio::EMBLDB::Common#ref.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns a References object built from ref, cached in
# @data['references'].
#
# Every reference Hash from ref becomes one Reference, created from a Hash
# (default value '') filled as follows:
# 'RA':: split on ", " into 'authors'
# 'RT':: stored as 'title'
# 'RL':: when it looks like "<journal> <vol> (<issue>), <from>-<to> (<year>)"
#        it is split into 'journal', 'volume', 'issue', 'pages' and 'year';
#        otherwise the whole value is stored as 'journal'
# 'RX':: every tag/xref pair is stored under the downcased tag
#        (PUBMED, MEDLINE, DOI)
def references
  return @data['references'] if @data['references']

  list = ref.map do |ent|
    attrs = Hash.new('')
    ent.each do |key, value|
      if key == 'RA'
        attrs['authors'] = value.split(/, /)
      elsif key == 'RT'
        attrs['title'] = value
      elsif key == 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          attrs['journal'], attrs['volume'], attrs['issue'],
            attrs['pages'], attrs['year'] = $~.captures
        else
          attrs['journal'] = value
        end
      elsif key == 'RX' # PUBMED, MEDLINE, DOI
        value.each { |tag, xref| attrs[tag.downcase] = xref }
      end
    end
    Reference.new(attrs)
  end
  @data['references'] = References.new(list)
end
seq() click to toggle source

returns a Bio::Sequence::AA of the amino acid sequence.

blank Line; sequence data (>=1)

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the amino acid sequence as a Sequence::AA, cached in @data[''].
#
# The sequence field (stored under the empty tag '') is read with fetch and
# all spaces and runs of digits are removed before the object is built.
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
  return @data['']
end
Also aliased as: aaseq
sequence_length() click to toggle source

returns a SEQUENCE_LENGTH in the ID line.

A short-cut for Bio::UniProtKB#id_line(‘SEQUENCE_LENGTH’).

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the 'SEQUENCE_LENGTH' value of the ID line, as given by id_line.
def sequence_length
  field = 'SEQUENCE_LENGTH'
  id_line(field)
end
Also aliased as: aalen
set_RN(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Normalizes the accumulated RN text of one reference: returns a copy of
# data without leading and trailing whitespace.
def set_RN(data)
  trimmed = data.strip
  trimmed
end
sq(key = nil) click to toggle source

returns a Hash of contents in the SQ lines.

  • Bio::UniProtKB#sq -> hsh

returns a value of a key given in the SQ lines.

  • Bio::UniProtKB#sq(key) -> int or str

  • Keys: [‘MW’, ‘mw’, ‘molecular’, ‘weight’, ‘aalen’, ‘len’, ‘length’,

    'CRC64']

SQ Line; sequence header (1/entry)

SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;

MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the SQ (sequence header) line.
#
# key:: nil (default) for the whole Hash {'aalen', 'MW', 'CRC64'}, or a key.
#       A key matching "mw", "molecular" or "weight" selects 'MW'; a key
#       matching "len", "length" or "AA" selects 'aalen'; any other key is
#       looked up in the Hash directly.
#
# The Hash is cached in @data['SQ'].
#
# Raises RuntimeError when the SQ line does not have the expected
# "<n> AA; <n> MW; <crc> CRC64;" shape.
def sq(key = nil)
  unless @data['SQ']
    header = /(\d+) AA\; (\d+) MW; (.+) CRC64;/.match(fetch('SQ'))
    raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless header

    @data['SQ'] = { 'aalen' => header[1].to_i, 'MW' => header[2].to_i, 'CRC64' => header[3] }
  end

  info = @data['SQ']
  return info unless key

  if /mw|molecular|weight/ === key
    info['MW']
  elsif /len|length|AA/ === key
    info['aalen']
  else
    info[key]
  end
end
synonyms() click to toggle source

returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format have been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. For keeping compatibility with old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.

For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the synonyms (unofficial and/or alternative names) as an Array
# of Strings.
#
# New format (de returns an Array, since UniProtKB release 14.0 of
# 22-Jul-2008): every value found in 'RecName', 'SubName' and 'AltName'
# entries that differs from protein_name, up to but excluding the first
# 'Includes' or 'Contains' entry. 'EC' values are reported as "EC <value>";
# 'Allergen' and 'CD_antigen' values as "<kind>=<value>".
#
# Old format (before Rel. 13.x): the texts in parentheses on the DE line,
# ignoring the bracketed "contains" part and anything mentioning
# "fragment".
def synonyms
  names = []
  parsed = de

  if parsed.kind_of?(Array)
    # since UniProtKB release 14.0 of 22-Jul-2008
    parsed.each do |entry|
      tag = entry[0]
      break if tag == 'Includes' || tag == 'Contains'
      next unless %w[RecName SubName AltName].include?(tag)

      entry[1..-1].each do |pair|
        value = pair[1]
        next unless value && value != protein_name

        names.push(
          case pair[0]
          when 'EC'
            "EC " + value
          when 'Allergen', 'CD_antigen'
            pair[0] + '=' + value
          else
            value
          end
        )
      end
    end
  elsif (de_line = fetch('DE'))
    # old format (before Rel. 13.x)
    # ignore stuff between [ and ].  That's the "contains" part
    de_line.sub(/\[.*\]/, '').scan(/\([^)]+/).each do |opened|
      next if opened =~ /fragment/i

      names << opened[1..-1].strip # index to remove the leading (
    end
  end
  return names
end