class Bio::UniProtKB

Description

Parser class for UniProtKB/SwissProt and TrEMBL database entry.

See the UniProtKB document files and manuals.

Examples

str = File.read("p53_human.swiss")
obj = Bio::UniProtKB.new(str)
obj.entry_id #=> "P53_HUMAN"

References

Public Instance Methods

aalen()
Alias for: sequence_length
aaseq()
Alias for: seq
cc(topic = nil) click to toggle source

returns contents in the CC lines.

returns an object of contents in the TOPIC.

returns contents of the “ALTERNATIVE PRODUCTS”.

  • Bio::UniProtKB#cc('ALTERNATIVE PRODUCTS') -> Hash

    {'Event' => str, 
     'Named isoforms' => int,  
     'Comment' => str,
     'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
    
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=15;
    ...
    CC         placentae isoforms. All tissues differentially splice exon 13;
    CC       Name=A; Synonyms=no del;
    CC         IsoId=P15529-1; Sequence=Displayed;

returns contents of the “DATABASE”.

  • Bio::UniProtKB#cc('DATABASE') -> Array

    [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
    
    CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].

returns contents of the “MASS SPECTROMETRY”.

  • Bio::UniProtKB#cc('MASS SPECTROMETRY') -> Array

    [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
    
    CC   -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].

CC lines (>=0, optional)

CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
CC       IN LIVER, KIDNEY, LUNG AND BRAIN.

CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.

See also www.expasy.org/sprot/userman.html#CC_line

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the CC (comment) lines.
#
# On first use the CC text is parsed into a Hash that maps each topic name
# to an Array of comment bodies, and the Hash is cached in @data['CC'].
#
# topic:: nil (default) to get the whole Hash, or a topic name String.
#
# Returns:
# * the whole Hash when +topic+ is nil;
# * a joined String for 'DEVELOPMENTAL STAGE', 'DISEASE',
#   'ENZYME REGULATION', 'FUNCTION' and 'INDUCTION' (nil when the entry has
#   no such comment -- previously this raised NoMethodError);
# * the result of the matching cc_* helper for the structured topics;
# * an Array of Hashes ('NAME', 'NOTE', 'WWW', 'FTP') for 'DATABASE';
# * the raw Array (or nil) for every other topic.
#
# When the entry has no CC lines an empty Hash is returned, whatever the
# +topic+, and nothing is cached.
#
# Raises RuntimeError when a comment block does not look like
# "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    parsed = {}

    # 12KD_MYCSM has no CC lines.
    return parsed if get('CC').size == 0

    cc_raw = fetch('CC')

    # Removing the copyright statement.
    cc_raw.sub!(/ *---.+---/m, '')

    # Not any CC Lines without the copyright statement.
    return parsed if cc_raw == ''

    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    begin
      # Keep only the text before a (remaining) border line.
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key  = $1
          body = $2
          # Re-join words hyphenated across a line break ("foo- bar").
          body.gsub!(/- (?!AND)/, '-')
          body.strip!
          (parsed[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so it is handled here too
      # (the former separate "rescue NoMethodError" clause was unreachable).
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = parsed
  end

  comments = @data['CC']

  case topic
  when nil
    comments
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(comments[topic])
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(comments[topic])
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the topic is spelled 'CATALITIC' here, so the correctly
    # spelled 'CATALYTIC ACTIVITY' falls through to the raw branch below.
    # Left unchanged because cc_catalytic_activity is not visible here --
    # confirm it handles real data before re-routing.
    cc_catalytic_activity(comments[topic])
  when 'CAUTION'
    cc_caution(comments[topic])
  when 'INTERACTION'
    cc_interaction(comments[topic])
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(comments[topic])
  when 'PATHWAY'
    cc_pathway(comments[topic])
  when 'RNA EDITING'
    cc_rna_editing(comments[topic])
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(comments[topic])
  when 'WEB RESOURCE'
    cc_web_resource(comments[topic])
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    # Free-text topics are returned as one String; absent topics give nil.
    blocks = comments[topic]
    blocks && blocks.join('')
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries = comments['DATABASE']
    return entries unless entries

    entries.map do |e|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # Drop the final character (the terminating '.') before splitting.
      e.sub(/.$/, '').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # ALLERGEN, BIOTECHNOLOGY, COFACTOR, DOMAIN, MISCELLANEOUS, PTM,
    # SIMILARITY, SUBUNIT, ... and unknown topics: raw Array or nil.
    comments[topic]
  end
end
dr(key = nil) click to toggle source

Bio::UniProtKB#dr

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the DR (database cross-reference) data.
#
# Without +key+ the result of embl_dr is returned unchanged. With a +key+,
# the entries stored under that key are converted to Hashes built from the
# first four elements of each entry; an unknown key gives an empty Array.
def dr(key = nil)
  return embl_dr unless key

  entries = embl_dr[key] || []
  entries.map do |fields|
    {
      'Accession'      => fields[0],
      'Version'        => fields[1],
      ' '              => fields[2],
      'Molecular Type' => fields[3]
    }
  end
end
Also aliased as: embl_dr
dt(key = nil) click to toggle source

returns a Hash of information in the DT lines.

hash keys: 
  ['created', 'sequence', 'annotation']

Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.

returns a String of information in the DT lines by a given key.

DT Line; date (3/entry)

DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.)
DT DD-MMM-YYYY (sequence version NN)
DT DD-MMM-YYYY (entry version NN)

The format was changed in UniProtKB release 7.0 of 07-Feb-2006. Below is the older format.

Old format of DT Line; date (3/entry)

DT DD-MMM-YYYY (rel. NN, Created)
DT DD-MMM-YYYY (rel. NN, Last sequence update)
DT DD-MMM-YYYY (rel. NN, Last annotation update)
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the DT (date) lines as a Hash with the keys 'created',
# 'sequence' and 'annotation' (first, second and third DT line), or the
# single value stored under +key+ when a key is given.
#
# The Hash is built once and cached in @data['DT'].
def dt(key = nil)
  return dt[key] if key

  @data['DT'] ||= begin
    # Removes the two-letter line tag plus its three-space gap.
    untag = lambda { |line| line.sub(/\w{2}   /, '').strip }
    created, sequence, annotation = get('DT').split(/\n/)
    {
      'created'    => untag.call(created),
      'sequence'   => untag.call(sequence),
      'annotation' => untag.call(annotation)
    }
  end
end
embl_dr(key = nil)

Backup Bio::EMBLDB#dr as embl_dr

Alias for: dr
entry()
Alias for: entry_id
entry_id() click to toggle source

returns the ENTRY_NAME in the ID line.

    # File lib/bio/db/embl/uniprotkb.rb
 # Returns the 'ENTRY_NAME' value of the ID line by delegating to
 # id_line('ENTRY_NAME').
 98 def entry_id
 99   id_line('ENTRY_NAME')
100 end
Also aliased as: entry_name, entry
entry_name()
Alias for: entry_id
ft(feature_key = nil) click to toggle source

returns contents in the feature table.

Examples

sp = Bio::UniProtKB.new(entry)
ft = sp.ft
ft.class #=> Hash
ft.keys.each do |feature_key|
  ft[feature_key].each do |feature|
    feature['From'] #=> '1'
    feature['To']   #=> '21'
    feature['Description'] #=> ''
    feature['FTId'] #=> ''
    feature['diff'] #=> []
    feature['original'] #=> [feature_key, '1', '21', '', '']
  end
end
  • Bio::UniProtKB#ft -> Hash

    {FEATURE_KEY => [{'From' => int, 'To' => int, 
                      'Description' => aStr, 'FTId' => aStr,
                      'diff' => [original_residues, changed_residues],
                      'original' => aAry }],...}

returns an Array of the information about the feature_name in the feature table.

FT Line; feature table data (>=0, optional)

Col     Data item
-----   -----------------
 1- 2   FT
 6-13   Feature name 
15-20   `FROM' endpoint
22-27   `TO' endpoint
35-75   Description (>=0 per key)
-----   -----------------

Note: 'FROM' and 'TO' endpoints are allowed to use non-numerical characters including '<', '>' or '?'. (c.f. '<1', '?42')

See also www.expasy.org/sprot/userman.html#FT_line

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the feature table (FT lines).
#
# feature_key:: nil (default) for the whole table, or a feature name such
#               as 'VARIANT' to get only that feature's Array.
#
# The table is a Hash mapping each feature name to an Array of Hashes with
# the keys 'From' and 'To' (Integers; the first non-digit character such
# as '<', '>' or '?' is dropped), 'Description', 'FTId', 'diff' and
# 'original' (the parsed fields as an Array of Strings).
#
# For VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features 'diff' is
# [original_residues, changed_residues]:
# * "X -> Y" descriptions give the two residue strings without spaces;
# * "Missing" descriptions give the affected subsequence and '' (this used
#   to be nil because of a misspelled variable name);
# * any other description gives [nil, nil].
# All other features keep 'diff' => [].
#
# The result is cached in @data['FT']. Any error while parsing is
# re-raised as a RuntimeError that quotes the FT lines.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  table = []
  begin
    get('FT').split("\n").each do |line|
      if line =~ /^FT   \w/
        # A new feature: slice the fixed-width columns.
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip,   # Feature Name
                  feature[14..19].strip,   # From
                  feature[21..26].strip,   # To
                  feature[34..74].strip ]  # Description
      else
        # A continuation line: append its text to the current feature.
        table.last << line.chomp.sub!(/^FT +/, '')
      end
    end

    # Joining Description lines
    table = table.map do |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid || '')
    end

    hash = {}
    table.each do |feature|
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To'   => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (hash[feature[0]] ||= []) << entry

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Copy both captures first: gsub below resets the match data.
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  @data['FT'] = hash
end
gene_name() click to toggle source

returns a String of the first gene name in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the first element of gene_names, or nil when gene_names itself
# returns nil/false.
def gene_name
  names = self.gene_names
  return nil unless names

  names.first
end
gene_names() click to toggle source

returns an Array of gene names in the GN line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the gene names taken from the parsed GN data.
#
# When the first parsed element is a Hash (new-style gene records), the
# :name of every record is collected; otherwise the first parsed element
# is returned as it is.
def gene_names
  gn # set @data['GN'] if it hasn't been already done
  records = @data['GN']
  head = records.first
  return head unless head.class == Hash

  records.collect { |record| record[:name] }
end
gn() click to toggle source

returns gene names in the GN line.

New UniProt/SwissProt format:

where <gene record> is:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}

Old format:

GN Line: Gene name(s) (>=0, optional)

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed GN (gene name) lines, parsing and caching them in
# @data['GN'] on first use.
#
# The UniProt-style parser is used when the GN text contains one of the
# "Name=", "ORFNames=", "OrderedLocusNames=" or "Synonyms=" tokens;
# otherwise the old-format parser is used.
def gn
  return @data['GN'] if @data['GN']

  uniprot_tokens = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
  gn_text = fetch('GN')
  @data['GN'] =
    if uniprot_tokens.any? { |token| token === gn_text }
      gn_uniprot_parser
    else
      gn_old_parser
    end
end
hi() click to toggle source

The HI line

Bio::UniProtKB#hi #=> Array of Hash ('Category', 'Keywords', 'Keyword')

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed HI line as an Array of Hashes, one per
# ". "-separated record, cached in @data['HI'].
#
# Each Hash has:
# 'Category':: the text before ": " ('' when absent);
# 'Keywords':: the "; "-separated items after it, without the last one;
# 'Keyword'::  the last item, with a trailing '.' removed ('' when absent).
#
# The result is stored only after every record has been parsed, so a
# failure can no longer leave a partial list cached, and records without
# a ": " part fall back to the '' / [] defaults instead of raising
# NoMethodError.
def hi
  unless @data['HI']
    @data['HI'] = fetch('HI').split(/\. /).map do |hlist|
      category, keywords = hlist.split(': ')
      keywords = keywords.to_s.split('; ')
      keyword = keywords.pop.to_s.sub(/\.$/, '')
      {'Category' => category.to_s, 'Keywords' => keywords, 'Keyword' => keyword}
    end
  end
  @data['HI']
end
id_line(key = nil) click to toggle source

returns a Hash of the ID line.

returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']

ID Line (since UniProtKB release 9.0 of 31-Oct-2006)

ID   P53_HUMAN               Reviewed;         393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"

ID Line (older style)

ID   P53_HUMAN      STANDARD;      PRT;   393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
   # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed ID line as a Hash with the keys 'ENTRY_NAME',
# 'DATA_CLASS', 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH', or the value stored
# under +key+ when a key is given.
#
# The line in @orig['ID'] is split on runs of spaces. When the fifth field
# is 'AA.' the line is in the newer layout (no molecule type, length in
# the fourth field); otherwise the fourth field is the molecule type and
# the fifth the length. The Hash is cached in @data['ID'].
def id_line(key = nil)
  return id_line[key] if key

  @data['ID'] ||= begin
    fields = @orig['ID'].split(/ +/)
    if fields[4].to_s.chomp == 'AA.'
      # after UniProtKB release 9.0 of 31-Oct-2006
      # (http://www.uniprot.org/docs/sp_news.htm)
      molecule_type = nil
      length_field  = fields[3]
    else
      molecule_type = fields[3].sub(/;/, '')
      length_field  = fields[4]
    end
    {
      'ENTRY_NAME'      => fields[1],
      'DATA_CLASS'      => fields[2].sub(/;/, ''),
      'MOLECULE_TYPE'   => molecule_type,
      'SEQUENCE_LENGTH' => length_field.to_i
    }
  end
end
molecule() click to toggle source

returns a MOLECULE_TYPE in the ID line.

A short-cut for Bio::UniProtKB#id_line('MOLECULE_TYPE').

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the 'MOLECULE_TYPE' value of the ID line by delegating to
# id_line('MOLECULE_TYPE').
108 def molecule
109   id_line('MOLECULE_TYPE')
110 end
Also aliased as: molecule_type
molecule_type()
Alias for: molecule
oh() click to toggle source

The OH Line;

OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the parsed OH (organism host) lines as an Array of Hashes
# {'NCBI_TaxID' => String, 'HostName' => String or nil}, cached in
# @data['OH'].
#
# Records are separated only at a ". " that is followed by the next
# "NCBI_TaxID=", so a host name that itself contains ". " (for example
# "Aedes sp. (mosquito)") stays in one record instead of triggering a
# format error. A trailing '.' is removed from the host name.
#
# Raises ArgumentError when a record has no "NCBI_TaxID=<digits>;" part.
def oh
  unless @data['OH']
    @data['OH'] = fetch('OH').split(/\. (?=NCBI_TaxID=)/).map do |record|
      unless record =~ /NCBI_TaxID=(\d+);(?: (.+))?/
        raise ArgumentError, ["Error: Invalid OH line format (#{entry_id}):",
                              "\n", get('OH'), "\n"].join
      end
      taxid = $1
      host_name = $2
      host_name = host_name.sub(/\.$/, '') if host_name
      {'NCBI_TaxID' => taxid, 'HostName' => host_name}
    end
  end
  @data['OH']
end
os(num = nil) click to toggle source

returns an Array of Hashes, or a String of the OS line when an index (num) is given.

  • Bio::EMBLDB#os -> Array

[{'name' => '(Human)', 'os' => 'Homo sapiens'}, 
 {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
{'name' => "(Human)", 'os' => 'Homo sapiens'}
  • Bio::UniProtKB#os['name'] -> “(Human)”

  • Bio::EPTR#os(0) -> “Homo sapiens (Human)”

OS Line; organism species (>=1)

OS   Genus species (name).
OS   Genus species (name0) (name1).
OS   Genus species (name0) (name1).
OS   Genus species (name0), G s0 (name0), and G s (name0) (name1).
OS   Homo sapiens (Human), and Rattus norvegicus (Rat)
OS   Hippotis sp. Clark and Watts 825.
OS   unknown cyperaceous sp.
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the organisms named in the OS line.
#
# num:: nil (default) for the whole list, or an Integer index.
#
# Without +num+ returns an Array of Hashes {'name' => "(Common name)" or
# nil, 'os' => "Genus species"}, cached in @data['OS']. With +num+ returns
# "Genus species (Common name)" for that organism; when the organism has
# no parenthesised name only the species text is returned (previously a
# trailing space was appended).
#
# Raises RuntimeError when a ", " / ", and" separated part contains
# nothing that looks like an organism name.
def os(num = nil)
  unless @data['OS']
    @data['OS'] = fetch('OS').split(/, and|, /).map do |part|
      unless part =~ /(\w+ *[\w \:\'\+\-\.]+[\w\.])/
        raise "Error: OS Line. \n#{fetch('OS')}\n"
      end
      species = $1
      {'name' => part[/(\(.+\))/, 1], 'os' => species}
    end
  end

  return @data['OS'] unless num

  # EX. "Trifolium repens (white clover)"
  organism = @data['OS'][num]
  [organism['os'], organism['name']].compact.join(' ')
end
ox() click to toggle source

returns a Hash of organism taxonomy cross-references.

OX Line; organism taxonomy cross-reference (>=1 per entry)

OX   NCBI_TaxID=1234;
OX   NCBI_TaxID=1234, 2345, 3456, 4567;
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the OX (taxonomy cross-reference) line as a Hash that maps each
# database name (the text before '=') to an Array of the ", "-separated
# identifiers after it. The Hash is cached in @data['OX'].
def ox
  @data['OX'] ||= begin
    statements = fetch('OX').sub(/\.$/, '').split(/;/).map { |s| s.strip }
    statements.each_with_object(Hash.new) do |statement, xrefs|
      db, refs = statement.split(/=/)
      xrefs[db] = refs.split(/, */)
    end
  end
  return @data['OX']
end
protein_name() click to toggle source

returns the proposed official name of the protein. Returns a String.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name, which is normally taken from the “RecName: Full=” or “SubName: Full=” line at the beginning of the DE lines. Unlike the parser for the old format, there is no special treatment for fragments or precursors.

For old format, the method parses the DE lines and returns the protein name as a String.

DE Line; description (>=1)

"DE #{OFFICIAL_NAME} (#{SYNONYM})"
"DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
OFFICIAL_NAME  1/entry
SYNONYM        >=0
CONTAINS       >=0
    # File lib/bio/db/embl/uniprotkb.rb
# Returns the protein name as a String.
#
# New format (parse_DE_line_rel14 returned data): the 'Full' value of the
# first RecName/SubName group that has one, or '' when there is none.
#
# Old format: the DE text before the first '[' and '(' (stripped), with
# ' (Fragment)' appended when the part before '[' mentions "fragment".
def protein_name
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  groups = @data['DE']

  unless groups
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    return "" unless de_line

    head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
    name = head[/^[^(]*/].strip
    name << ' (Fragment)' if head =~ /fragment/i
    return name
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  groups.each do |group|
    next unless ['RecName', 'SubName'].include?(group[0])

    full = group[1..-1].find { |pair| pair[0] == 'Full' }
    return full[1].to_s if full
  end
  nil.to_s
end
ref() click to toggle source

returns contents in the R lines.

where <reference information Hash> is:

{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}

R Lines

  • RN RC RP RX RA RT RL RG

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the reference (R*) lines as an Array of Hashes, one per
# reference, with the keys 'RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL' and
# 'RG'. The Array is cached in @data['R'].
#
# The R text is cut at every "\nRN   "; within a reference the text of
# each line is appended (plus a space) to its tag's entry, and every entry
# is finally passed through the matching set_RN ... set_RG method.
#
# Raises RuntimeError for a line that does not start with a reference tag.
def ref
  return @data['R'] if @data['R']

  tags = %w[RN RC RP RX RA RT RL RG]
  @data['R'] = get('R').split(/\nRN   /).map do |chunk|
    # The split consumed the leading "RN   " of every chunk but the first.
    chunk = 'RN   ' + chunk unless /^RN   / =~ chunk

    fields = {}
    tags.each { |tag| fields[tag] = '' }

    chunk.split("\n").each do |line|
      unless /^(R[NPXARLCTG])   (.+)/ =~ line
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      tag, text = $1, $2
      fields[tag] += text + ' '
    end

    tags.each { |tag| fields[tag] = __send__("set_#{tag}", fields[tag]) }
    fields
  end
end
references() click to toggle source

returns Bio::Reference object from Bio::EMBLDB::Common#ref.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns a References object built from ref, cached in
# @data['references'].
#
# For every reference Hash: 'RA' is split on ", " into 'authors', 'RT'
# becomes 'title', 'RL' is broken into 'journal', 'volume', 'issue',
# 'pages' and 'year' when it matches "Journal VOL (ISSUE), PP-PP (YEAR)"
# (otherwise it is kept whole as 'journal'), and each tag/value pair of
# 'RX' is stored under the down-cased tag.
def references
  return @data['references'] if @data['references']

  list = self.ref.map do |ent|
    attrs = Hash.new('')
    ent.each do |key, value|
      case key
      when 'RA'
        attrs['authors'] = value.split(/, /)
      when 'RT'
        attrs['title'] = value
      when 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          attrs['journal'], attrs['volume'], attrs['issue'],
            attrs['pages'], attrs['year'] = $~.captures
        else
          attrs['journal'] = value
        end
      when 'RX'  # PUBMED, MEDLINE, DOI
        value.each { |tag, xref| attrs[tag.downcase] = xref }
      end
    end
    Reference.new(attrs)
  end

  @data['references'] = References.new(list)
end
seq() click to toggle source

returns a Bio::Sequence::AA of the amino acid sequence.

blank Line; sequence data (>=1)

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the sequence as a Sequence::AA, built from the untagged
# (sequence) lines with every space and digit run removed, and cached in
# @data[''].
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
  return @data['']
end
Also aliased as: aaseq
sequence_length() click to toggle source

returns a SEQUENCE_LENGTH in the ID line.

A short-cut for Bio::UniProtKB#id_line('SEQUENCE_LENGTH').

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the 'SEQUENCE_LENGTH' value of the ID line by delegating to
# id_line('SEQUENCE_LENGTH').
117 def sequence_length
118   id_line('SEQUENCE_LENGTH')
119 end
Also aliased as: aalen
set_RN(data) click to toggle source
    # File lib/bio/db/embl/uniprotkb.rb
# Returns +data+ with leading and trailing whitespace removed
# (String#strip). ref passes the accumulated 'RN' text through this.
587 def set_RN(data)
588   data.strip
589 end
sq(key = nil) click to toggle source

returns a Hash of contents in the SQ lines.

  • Bio::UniProtKB#sq -> hsh

returns a value of a key given in the SQ lines.

  • Bio::UniProtKB#sq(key) -> int or str

  • Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',

    'CRC64']

SQ Line; sequence header (1/entry)

SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;

MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).

     # File lib/bio/db/embl/uniprotkb.rb
# Returns the contents of the SQ (sequence header) line.
#
# key:: nil (default) for the whole Hash, or a key selecting one value.
#
# Without +key+ returns {'aalen' => Integer, 'MW' => Integer,
# 'CRC64' => String}, cached in @data['SQ']. With +key+:
# * keys matching /mw/, /molecular/ or /weight/ give the molecular weight;
# * keys matching /len/, /length/ or /AA/ give the sequence length;
# * any other key ('MW', 'CRC64', ...) is looked up in the Hash directly.
#
# The line is accepted with one or more spaces between its fields, so
# both whitespace-squeezed text and the raw
# "SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;" layout
# parse.
#
# Raises RuntimeError when the SQ line does not have that shape.
def sq(key = nil)
  unless @data['SQ']
    if fetch('SQ') =~ /(\d+) +AA; +(\d+) +MW; +(.+?) +CRC64;/
      @data['SQ'] = { 'aalen' => $1.to_i, 'MW' => $2.to_i, 'CRC64' => $3 }
    else
      raise "Invalid SQ Line: \n'#{fetch('SQ')}'"
    end
  end

  return @data['SQ'] unless key

  case key
  when /mw/, /molecular/, /weight/
    @data['SQ']['MW']
  when /len/, /length/, /AA/
    @data['SQ']['aalen']
  else
    @data['SQ'][key]
  end
end
synonyms() click to toggle source

returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. For keeping compatibility with old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.

For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.

    # File lib/bio/db/embl/uniprotkb.rb
# Returns the synonyms (unofficial and/or alternative names) as an Array
# of Strings.
#
# New format (parse_DE_line_rel14 returned data): every value found in the
# RecName, SubName and AltName groups that differs from protein_name, in
# order, stopping at the first 'Includes' or 'Contains' group. 'EC' values
# are reported as "EC <value>", 'Allergen' and 'CD_antigen' values as
# "<tag>=<value>".
#
# Old format: the text of every "(...)" group in the DE line outside the
# "[...]" part, except groups that mention "fragment".
def synonyms
  ary = Array.new
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']
  if parsed_de_line
    # since UniProtKB release 14.0 of 22-Jul-2008
    # Looked up once instead of once per name pair.
    official_name = protein_name
    parsed_de_line.each do |a|
      case a[0]
      when 'Includes', 'Contains'
        break #the each loop
      when 'RecName', 'SubName', 'AltName'
        a[1..-1].each do |b|
          value = b[1]
          next if !value || value == official_name

          ary.push(
            case b[0]
            when 'EC'
              "EC " + value
            when 'Allergen', 'CD_antigen'
              b[0] + '=' + value
            else
              value
            end
          )
        end
      end
    end
  elsif (de_line = fetch('DE'))
    # old format (before Rel. 13.x)
    line = de_line.sub(/\[.*\]/, '') # ignore stuff between [ and ].  That's the "contains" part
    line.scan(/\([^)]+/) do |synonym|
      # index to remove the leading (
      ary << synonym[1..-1].strip unless synonym =~ /fragment/i
    end
  end
  return ary
end