"""
mass - molecular masses and isotope distributions
=================================================
Summary
-------
This module defines general functions for mass and isotope abundance
calculations. For most of the functions, the user can define a given
substance in various formats, but all of them would be reduced to the
:py:func:`Composition <Composition.__init__>` object describing its
chemical composition.
Classes
-------
:py:func:`Composition <Composition.__init__>` - a class storing chemical
composition of a substance.
:py:class:`Unimod` - a class representing a Python interface to the
`Unimod database <http://unimod.org/>`_
(see :py:mod:`pyteomics.mass.unimod` for a much more powerful alternative).
Mass calculations
-----------------
:py:func:`calculate_mass` - a general routine for mass / m/z
calculation. Can calculate mass for a polypeptide sequence, chemical
formula or elemental composition. Supplied with an ion type and
charge, the function would calculate m/z.
:py:func:`fast_mass` - a less powerful but much faster function for
polypeptide mass calculation.
:py:func:`fast_mass2` - a version of `fast_mass` that supports *modX* notation.
Isotopic abundances
-------------------
:py:func:`isotopic_composition_abundance` - calculate the relative
abundance of a given isotopic composition.
:py:func:`most_probable_isotopic_composition` - finds the most
abundant isotopic composition for a molecule defined by a
polypeptide sequence, chemical formula or elemental composition.
:py:func:`isotopologues` - iterate over possible isotopic conposition of a molecule,
possibly filtered by abundance.
Data
----
:py:data:`nist_mass` - a dict with exact masses of the most abundant
isotopes.
:py:data:`std_aa_comp` - a dict with the elemental compositions
of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
:py:data:`std_ion_comp` - a dict with the relative elemental
compositions of the standard peptide fragment ions.
:py:data:`std_aa_mass` - a dict with the monoisotopic masses
of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
-----------------------------------------------------------------------------
"""
# Copyright 2012 Anton Goloborodko, Lev Levitsky
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import math
from .. import parser
from ..auxiliary import PyteomicsError, _nist_mass, BasicComposition
from itertools import chain, product, combinations_with_replacement
from collections import defaultdict
try:
from urllib import urlopen
except ImportError:
from urllib.request import urlopen
from datetime import datetime
import re
import operator
import warnings
nist_mass = _nist_mass
"""
A dict with the exact element masses downloaded from the NIST website:
http://www.nist.gov/pml/data/comp.cfm . There are entries for each
element containing the masses and relative abundances of several
abundant isotopes and a separate entry for undefined isotope with zero
key, mass of the most abundant isotope and 1.0 abundance.
"""
PROTON = 'H+'
def _make_isotope_string(element_name, isotope_num):
"""Form a string label for an isotope."""
if isotope_num == 0:
return element_name
else:
return '{}[{}]'.format(element_name, isotope_num)
def _parse_isotope_string(label):
"""Parse an string with an isotope label and return the element name and
the isotope number.
>>> _parse_isotope_string('C')
('C', 0)
>>> _parse_isotope_string('C[12]')
('C', 12)
"""
element_name, num = re.match(_isotope_string, label).groups()
isotope_num = int(num) if num else 0
return element_name, isotope_num
# Initialize std_aa_comp and std_ion_comp before the Composition class
# description, fill it later.
std_aa_comp = {}
"""A dictionary with elemental compositions of the twenty standard
amino acid residues, selenocysteine, pyrrolysine,
and standard H- and -OH terminal groups.
"""
std_ion_comp = {}
"""A dict with relative elemental compositions of the standard peptide
fragment ions. An elemental composition of a fragment ion is calculated as a
difference between the total elemental composition of an ion
and the sum of elemental compositions of its constituting amino acid residues.
"""
_isotope_string = r'^([A-Z][a-z+]*)(?:\[(\d+)\])?$'
_atom = r'([A-Z][a-z+]*)(?:\[(\d+)\])?([+-]?\d+)?'
_formula = r'^({})*$'.format(_atom)
def _raise_term_label_exception(what='comp'):
raise PyteomicsError("Cannot use a mod label as a terminal group. Provide correct group {0}"
" in `aa_{0}`.".format(what))
class Composition(BasicComposition):
"""
A Composition object stores a chemical composition of a
substance. Basically, it is a dict object, with the names
of chemical elements as keys and values equal to an integer number of
atoms of the corresponding element in a substance.
The main improvement over dict is that Composition objects allow
adding and subtraction.
"""
_kw_sources = {'formula', 'sequence', 'parsed_sequence', 'split_sequence', 'composition'}
_carrier_spec = r"^(?P<formula>\S+?)(?:(?P<sign>[+-])(?P<charge>\d+)?)?$"
def _from_parsed_sequence(self, parsed_sequence, aa_comp):
self.clear()
comp = defaultdict(int)
failflag = False
for label in parsed_sequence:
if label in aa_comp:
for elem, cnt in aa_comp[label].items():
comp[elem] += cnt
elif parser.is_term_group(label):
slabel = label.strip('-')
if slabel in aa_comp:
# a modification label used as terminal group. This is prone to errors and not allowed
_raise_term_label_exception()
elif re.match(_formula, slabel):
comp += Composition(formula=slabel)
else:
failflag = True
else:
try:
mod, aa = parser._split_label(label)
for elem, cnt in chain(
aa_comp[mod].items(), aa_comp[aa].items()):
comp[elem] += cnt
except (PyteomicsError, KeyError):
failflag = True
if failflag:
raise PyteomicsError('No information for %s in `aa_comp`' % label)
self._from_composition(comp)
def _from_split_sequence(self, split_sequence, aa_comp):
self.clear()
comp = defaultdict(int)
for group in split_sequence:
i = 0
while i < len(group):
for j in range(len(group) + 1, -1, -1):
label = ''.join(group[i:j])
if label in aa_comp:
for elem, cnt in aa_comp[label].items():
comp[elem] += cnt
elif parser.is_term_group(label) and label.strip('-') in aa_comp:
_raise_term_label_exception()
else:
continue
i = j
break
if j == 0:
raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group))
self._from_composition(comp)
def _from_sequence(self, sequence, aa_comp):
parsed_sequence = parser.parse(
sequence,
labels=aa_comp,
show_unmodified_termini=True,
allow_unknown_modifications=True)
self._from_parsed_sequence(parsed_sequence, aa_comp)
def _from_formula(self, formula):
if not re.match(_formula, formula):
raise PyteomicsError('Invalid formula: ' + formula)
for elem, isotope, number in re.findall(_atom, formula):
self[_make_isotope_string(elem, int(isotope) if isotope else 0)] += int(number) if number else 1
def _from_composition(self, comp):
for isotope_string, num_atoms in comp.items():
element_name, isotope_num = _parse_isotope_string(
isotope_string)
# Remove explicitly undefined isotopes (e.g. X[0]).
self[_make_isotope_string(element_name, isotope_num)] = num_atoms
[docs]
def __init__(self, *args, **kwargs):
"""
A Composition object stores a chemical composition of a
substance. Basically it is a dict object, in which keys are the names
of chemical elements and values contain integer numbers of
corresponding atoms in a substance.
The main improvement over dict is that Composition objects allow
addition and subtraction.
A Composition object can be initialized with one of the
following arguments: formula, sequence, parsed_sequence or
split_sequence.
If none of these are specified, the constructor will look at the first
positional argument and try to build the object from it. Without
positional arguments, a Composition will be constructed directly from
keyword arguments.
If there's an ambiguity, i.e. the argument is both a valid sequence
and a formula (such as 'HCN'), it will be treated as a sequence. You
need to provide the 'formula' keyword to override this.
.. warning::
Be careful when supplying a list with a parsed sequence or a split
sequence as a keyword argument. It must be
obtained with enabled `show_unmodified_termini` option.
When supplying it as a positional argument, the option doesn't
matter, because the positional argument is always converted to
a sequence prior to any processing.
Parameters
----------
formula : str, optional
A string with a chemical formula.
sequence : str, optional
A polypeptide sequence string in modX notation.
parsed_sequence : list of str, optional
A polypeptide sequence parsed into a list of amino acids.
split_sequence : list of tuples of str, optional
A polypeptyde sequence parsed into a list of tuples
(as returned be :py:func:`pyteomics.parser.parse` with
``split=True``).
aa_comp : dict, optional
A dict with the elemental composition of the amino acids (the
default value is std_aa_comp).
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
ion_type : str, optional
If specified, then the polypeptide is considered to be in the form
of the corresponding ion.
"""
defaultdict.__init__(self, int)
aa_comp = kwargs.get('aa_comp', std_aa_comp)
kw_given = self._kw_sources.intersection(kwargs)
if len(kw_given) > 1:
raise PyteomicsError('Only one of {} can be specified!\n'
'Given: {}'.format(', '.join(self._kw_sources),
', '.join(kw_given)))
elif kw_given:
kwa = kw_given.pop()
if kwa == 'formula':
self._from_formula(kwargs['formula'])
else:
getattr(self, '_from_' + kwa)(kwargs[kwa], aa_comp)
# can't build from kwargs
elif args:
if isinstance(args[0], dict):
self._from_composition(args[0])
elif isinstance(args[0], str):
try:
self._from_sequence(args[0], aa_comp)
except PyteomicsError:
try:
self._from_formula(args[0])
except PyteomicsError:
raise PyteomicsError(
'Could not create a Composition object from '
'string: "{}": not a valid sequence or '
'formula'.format(args[0]))
else:
try:
self._from_sequence(parser.tostring(args[0], True), aa_comp)
except Exception:
raise PyteomicsError('Could not create a Composition object'
' from `{}`. A Composition object must be '
'specified by sequence, parsed or split sequence,'
' formula or dict.'.format(args[0]))
else:
self._from_composition(kwargs)
ion_comp = kwargs.get('ion_comp', std_ion_comp)
if 'ion_type' in kwargs:
self += ion_comp[kwargs['ion_type']]
# Charge is not supported in kwargs
charge = self['H+']
if 'charge' in kwargs:
if charge:
raise PyteomicsError('Charge is specified both by the number of protons and `charge` in kwargs')
else:
warnings.warn('charge and charge carrier should be specified when calling mass(). '
'Support for charge in Composition.__init__ will be removed in a future version.',
FutureWarning)
self['H+'] = kwargs['charge']
@classmethod
def _parse_carrier(cls, spec):
"""Parse a charge carrier spec.
The spec syntax is: <formula>[+-][N]
<formula> is a chemical formula as supported by :py:meth:`_from_formula`.
[+-] is one of "+" or "-", N is a natural number (1 is assumed if omitted).
If both the sign and the charge are missing, the charge of this group can be
specified as the number of protons in `<formula>`. Otherwise, having protons
in `<formula>` is an error.
Returns
-------
out : tuple
Parsed :py:class:`Composition` and charge of the charge carrier.
"""
if spec is None:
return cls({PROTON: 1}), 1
try:
formula, sign, charge = re.match(cls._carrier_spec, spec).groups()
except AttributeError:
raise PyteomicsError('Invalid charge carrier specification: ' + spec)
comp = cls(formula=formula)
if sign is not None and PROTON in comp:
raise PyteomicsError('Carrier contains protons and also has a charge specified.')
if sign is None:
# only formula is given
if PROTON not in comp:
charge = None
charge = comp[PROTON]
elif charge is None:
charge = (-1, 1)[sign == '+']
else:
charge = int(charge) * (-1, 1)[sign == '+']
return comp, charge
@staticmethod
def _mass_to_mz(mass, composition=None, **kwargs):
mass_data = kwargs.get('mass_data', nist_mass)
absolute = kwargs.get('absolute', True)
average = kwargs.get('average', False)
# Calculate m/z if required
charge = kwargs.get('charge')
if charge:
# get charge carrier mass and charge
charge_carrier = kwargs.get('charge_carrier')
ccharge = kwargs.get('carrier_charge')
if isinstance(charge_carrier, dict):
carrier_comp = Composition(charge_carrier)
if ccharge and PROTON in carrier_comp:
raise PyteomicsError('`carrier_charge` specified but the charge carrier contains protons.')
carrier_charge = ccharge or carrier_comp[PROTON]
if not carrier_charge:
raise PyteomicsError('Charge carrier charge not specified.')
else:
carrier_comp, carrier_charge = (composition or Composition)._parse_carrier(charge_carrier)
if carrier_charge and ccharge:
raise PyteomicsError('Both `carrier_charge` and charge in carrier spec are given.')
carrier_charge = ccharge or carrier_charge
if not carrier_charge:
raise PyteomicsError('Charge of the charge carrier group not specified.')
if charge % carrier_charge:
raise PyteomicsError('The `charge` must be a multiple of the carrier charge. Given: {} and {}'.format(
charge, carrier_charge))
num = charge // carrier_charge
carrier_mass = carrier_comp.mass(mass_data=mass_data, average=average, charge=0)
if charge and (composition is None or not composition['H+']):
mass += carrier_mass * num
if charge and composition and composition['H+']:
raise PyteomicsError('Composition contains protons and charge is explicitly specified.')
if charge is None and composition and composition['H+']:
warnings.warn('Charge is not specified, but the Composition contains protons. Assuming m/z calculation.')
charge = composition['H+']
if charge:
mass /= charge
if charge and charge < 0 and absolute:
mass = abs(mass)
return mass
[docs]
def mass(self, **kwargs):
"""Calculate the mass or *m/z* of a :py:class:`Composition`.
Parameters
----------
average : bool, optional
If :py:const:`True` then the average mass is calculated.
Note that mass is not averaged for elements with specified isotopes.
Default is :py:const:`False`.
charge : int, optional
If not 0 then m/z is calculated. See also: `charge_carrier`.
charge_carrier : str or dict, optional
Chemical group carrying the charge. Defaults to a proton, "H+".
If string, must be a chemical formula, as supported by the
:class:`Composition` `formula` argument,
except it must end with a charge formatted as "[+-][N]".
If N is omitted, single charge is assumed.
Examples of `charge_carrier`: "H+", "NH3+"
(here, 3 is part of the composition, and + is a single charge),
"Fe+2" ("Fe" is the formula and "+2" is the charge).
.. note :: `charge` must be a multiple of `charge_carrier` charge.
If dict, it is the atomic composition of the group.
In this case, the charge can be passed separately as `carrier_charge`
or it will be deduced from the number of protons in `charge_carrier`.
carrier_charge : int, optional
Charge of the charge carrier group (if `charge_carrier` is specified
as a composition dict).
.. note :: `charge` must be a multiple of `charge_charge`.
mass_data : dict, optional
A dict with the masses of the chemical elements (the default
value is :py:data:`nist_mass`).
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
ion_type : str, optional
If specified, then the polypeptide is considered to be in the form
of the corresponding ion. Do not forget to specify the charge state!
absolute : bool, optional
If :py:const:`True` (default), the m/z value returned will always be positive,
even for negatively charged ions.
.. note ::
`absolute` only applies when `charge` is negative.
The mass can still be negative for negative compositions.
Returns
-------
mass : float
"""
mass_data = kwargs.get('mass_data', nist_mass)
# Calculate mass
mass = 0.0
average = kwargs.get('average', False)
try:
for isotope_string, amount in self.items():
element_name, isotope_num = _parse_isotope_string(isotope_string)
# Calculate average mass if required and the isotope number is
# not specified.
if (not isotope_num) and average:
for isotope, data in mass_data[element_name].items():
if isotope:
mass += (amount * data[0] * data[1])
else:
mass += (amount * mass_data[element_name][isotope_num][0])
except KeyError as e:
raise PyteomicsError('No mass information for element: {}'.format(e.args[0]))
return self._mass_to_mz(mass, self, **kwargs)
std_aa_comp.update({
'A': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1}),
'C': Composition({'H': 5, 'C': 3, 'S': 1, 'O': 1, 'N': 1}),
'D': Composition({'H': 5, 'C': 4, 'O': 3, 'N': 1}),
'E': Composition({'H': 7, 'C': 5, 'O': 3, 'N': 1}),
'F': Composition({'H': 9, 'C': 9, 'O': 1, 'N': 1}),
'G': Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}),
'H': Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}),
'I': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
'J': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
'K': Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}),
'L': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
'M': Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}),
'N': Composition({'H': 6, 'C': 4, 'O': 2, 'N': 2}),
'P': Composition({'H': 7, 'C': 5, 'O': 1, 'N': 1}),
'Q': Composition({'H': 8, 'C': 5, 'O': 2, 'N': 2}),
'R': Composition({'H': 12, 'C': 6, 'N': 4, 'O': 1}),
'S': Composition({'H': 5, 'C': 3, 'O': 2, 'N': 1}),
'T': Composition({'H': 7, 'C': 4, 'O': 2, 'N': 1}),
'V': Composition({'H': 9, 'C': 5, 'O': 1, 'N': 1}),
'W': Composition({'C': 11, 'H': 10, 'N': 2, 'O': 1}),
'Y': Composition({'H': 9, 'C': 9, 'O': 2, 'N': 1}),
'U': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1, 'Se' : 1}),
'O': Composition({'H': 19, 'C': 12, 'O': 2, 'N': 3}),
'H-': Composition({'H': 1}),
'-OH': Composition({'O': 1, 'H': 1}),
})
std_ion_comp.update({
'M': Composition(formula=''),
'M-H2O': Composition(formula='H-2O-1'),
'M-NH3': Composition(formula='N-1H-3'),
'a': Composition(formula='H-2O-1' + 'C-1O-1'),
'a-H2O': Composition(formula='H-2O-1' + 'C-1O-1' + 'H-2O-1'),
'a-NH3': Composition(formula='H-2O-1' + 'C-1O-1' + 'N-1H-3'),
'b': Composition(formula='H-2O-1'),
'b-H2O': Composition(formula='H-2O-1' + 'H-2O-1'),
'b-NH3': Composition(formula='H-2O-1' + 'N-1H-3'),
'c': Composition(formula='H-2O-1' + 'NH3'),
'c-1': Composition(formula='H-2O-1' + 'NH3' + 'H-1'),
'c-dot': Composition(formula='H-2O-1' + 'NH3' + 'H1'),
'c+1': Composition(formula='H-2O-1' + 'NH3' + 'H1'),
'c+2': Composition(formula='H-2O-1' + 'NH3' + 'H2'),
'c-H2O': Composition(formula='H-2O-1' + 'NH3' + 'H-2O-1'),
'c-NH3': Composition(formula='H-2O-1'),
'x': Composition(formula='H-2O-1' + 'CO2'),
'x-H2O': Composition(formula='H-2O-1' + 'CO2' + 'H-2O-1'),
'x-NH3': Composition(formula='H-2O-1' + 'CO2' + 'N-1H-3'),
'y': Composition(formula=''),
'y-H2O': Composition(formula='H-2O-1'),
'y-NH3': Composition(formula='N-1H-3'),
'z': Composition(formula='H-2O-1' + 'ON-1H-1'),
'z-dot': Composition(formula='H-2O-1' + 'ON-1'),
'z+1': Composition(formula='H-2O-1' + 'ON-1H1'),
'z+2': Composition(formula='H-2O-1' + 'ON-1H2'),
'z+3': Composition(formula='H-2O-1' + 'ON-1H3'),
'z-H2O': Composition(formula='H-2O-1' + 'ON-1H-1' + 'H-2O-1'),
'z-NH3': Composition(formula='H-2O-1' + 'ON-1H-1' + 'N-1H-3'),
})
[docs]
def calculate_mass(*args, **kwargs):
"""Calculates the monoisotopic mass of a polypeptide defined by a
sequence string, parsed sequence, chemical formula or
Composition object.
One or none of the following keyword arguments is required:
**formula**, **sequence**, **parsed_sequence**, **split_sequence**
or **composition**.
All arguments given are used to create a :py:class:`Composition` object,
unless an existing one is passed as a keyword argument.
Note that if a sequence string is supplied and terminal groups are not
explicitly shown, then the mass is calculated for a polypeptide with
standard terminal groups (NH2- and -OH).
.. warning::
Be careful when supplying a list with a parsed sequence. It must be
obtained with enabled `show_unmodified_termini` option.
Parameters
----------
formula : str, optional
A string with a chemical formula.
sequence : str, optional
A polypeptide sequence string in modX notation.
proforma : str, optional
A polypeptide sequeence string in `ProForma notation <https://www.psidev.info/proforma>`_,
or a :py:class:`pyteomics.proforma.ProForma` object.
parsed_sequence : list of str, optional
A polypeptide sequence parsed into a list of amino acids.
composition : Composition, optional
A Composition object with the elemental composition of a substance.
aa_comp : dict, optional
A dict with the elemental composition of the amino acids (the
default value is std_aa_comp).
average : bool, optional
If :py:const:`True` then the average mass is calculated. Note that mass
is not averaged for elements with specified isotopes. Default is
:py:const:`False`.
charge : int, optional
If not 0 then m/z is calculated: the mass is increased
by the corresponding number of proton masses and divided
by `charge`.
charge_carrier : str or dict, optional
Chemical group carrying the charge. Defaults to a proton, "H+".
If string, must be a chemical formula, as supported by the
:class:`Composition` `formula` argument,
except it must end with a charge formatted as "[+-][N]".
If N is omitted, single charge is assumed.
Examples of `charge_carrier`: "H+", "NH3+"
(here, 3 is part of the composition, and + is a single charge),
"Fe+2" ("Fe" is the formula and "+2" is the charge).
.. note ::
`charge` must be a multiple of `charge_carrier` charge.
If dict, it is the atomic composition of the group.
In this case, the charge can be passed separately as `carrier_charge`
or it will be deduced from the number of protons in `charge_carrier`.
carrier_charge : int, optional
Charge of the charge carrier group (if `charge_carrier` is specified
as a composition dict).
.. note ::
`charge` must be a multiple of `charge_charge`.
mass_data : dict, optional
A dict with the masses of the chemical elements (the default
value is :py:data:`nist_mass`).
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
ion_type : str, optional
If specified, then the polypeptide is considered to be in the form
of the corresponding ion. Do not forget to specify the charge state!
absolute : bool, optional
If :py:const:`True` (default), the m/z value returned will always be positive,
even for negatively charged ions.
.. note ::
`absolute` only applies when `charge` is negative.
The mass can still be negative for negative compositions.
Returns
-------
mass : float
"""
if 'proforma' in kwargs:
# do not try to create a composition
from .. import proforma
proteoform = kwargs.pop('proforma')
if isinstance(proteoform, str):
proteoform = proforma.ProForma.parse(proteoform)
return Composition._mass_to_mz(proteoform.mass, **kwargs)
# These parameters must be passed to mass(), not __init__
mass_kw = {}
for k in ['charge', 'charge_carrier', 'carrier_charge', 'absolute']:
if k in kwargs:
mass_kw[k] = kwargs.pop(k)
# Make a copy of `composition` keyword argument.
if 'composition' in kwargs:
composition = Composition(kwargs.pop('composition'))
else:
composition = Composition(*args, **kwargs)
kwargs.update(mass_kw)
return composition.mass(**kwargs)
[docs]
def most_probable_isotopic_composition(*args, **kwargs):
"""Calculate the most probable isotopic composition of a peptide
molecule/ion defined by a sequence string, parsed sequence,
chemical formula or :py:class:`Composition` object.
Note that if a sequence string without terminal groups is supplied then the
isotopic composition is calculated for a polypeptide with standard
terminal groups (H- and -OH).
For each element, only two most abundant isotopes are considered.
Parameters
----------
formula : str, optional
A string with a chemical formula.
sequence : str, optional
A polypeptide sequence string in modX notation.
parsed_sequence : list of str, optional
A polypeptide sequence parsed into a list of amino acids.
composition : :py:class:`Composition`, optional
A :py:class:`Composition` object with the elemental composition of a
substance.
elements_with_isotopes : list of str
A list of elements to be considered in isotopic distribution
(by default, every element has a isotopic distribution).
aa_comp : dict, optional
A dict with the elemental composition of the amino acids (the
default value is :py:data:`std_aa_comp`).
mass_data : dict, optional
A dict with the masses of chemical elements (the default
value is :py:data:`nist_mass`).
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
Returns
-------
out: tuple (Composition, float)
A tuple with the most probable isotopic composition and its
relative abundance.
"""
composition = (dict(kwargs['composition']) if 'composition' in kwargs
else Composition(*args, **kwargs))
# Removing isotopes from the composition.
for isotope_string in composition:
element_name, isotope_num = _parse_isotope_string(isotope_string)
if isotope_num:
composition[element_name] += composition.pop(isotope_string)
mass_data = kwargs.get('mass_data', nist_mass)
elements_with_isotopes = kwargs.get('elements_with_isotopes')
isotopic_composition = Composition()
for element_name in composition:
if not elements_with_isotopes or (element_name in elements_with_isotopes):
# Take the two most abundant isotopes.
first_iso, second_iso = sorted([(i[0], i[1][1]) for i in mass_data[element_name].items() if i[0]],
key=lambda x: -x[1])[:2]
# Write the number of isotopes of the most abundant type.
first_iso_str = _make_isotope_string(element_name, first_iso[0])
isotopic_composition[first_iso_str] = int(math.ceil(
composition[element_name])) * first_iso[1]
# Write the number of the second isotopes.
second_iso_str = _make_isotope_string(element_name, second_iso[0])
isotopic_composition[second_iso_str] = composition[element_name] - isotopic_composition[first_iso_str]
else:
isotopic_composition[element_name] = composition[element_name]
return (isotopic_composition,
isotopic_composition_abundance(composition=isotopic_composition, mass_data=mass_data))
[docs]
def isotopic_composition_abundance(*args, **kwargs):
"""Calculate the relative abundance of a given isotopic composition
of a molecule.
Parameters
----------
formula : str, optional
A string with a chemical formula.
composition : Composition, optional
A Composition object with the isotopic composition of a substance.
mass_data : dict, optional
A dict with the masses of chemical elements (the default
value is :py:data:`nist_mass`).
Returns
-------
relative_abundance : float
The relative abundance of a given isotopic composition.
"""
composition = (Composition(kwargs['composition'])
if 'composition' in kwargs
else Composition(*args, **kwargs))
isotopic_composition = defaultdict(dict)
# Check if there are default and non-default isotopes of the same
# element and rearrange the elements.
for element in composition:
element_name, isotope_num = _parse_isotope_string(element)
# If there is already an entry for this element and either it
# contains a default isotope or newly added isotope is default
# then raise an exception.
if (element_name in isotopic_composition) and (isotope_num == 0 or 0 in isotopic_composition[element_name]):
raise PyteomicsError(
'Please specify the isotopic states of all atoms of %s or do not specify them at all.' % element_name)
else:
isotopic_composition[element_name][isotope_num] = composition[element]
# Calculate relative abundance.
mass_data = kwargs.get('mass_data', nist_mass)
num1, num2, denom = 1, 1, 1
for element_name, isotope_dict in isotopic_composition.items():
num1 *= math.factorial(sum(isotope_dict.values()))
for isotope_num, isotope_content in isotope_dict.items():
denom *= math.factorial(isotope_content)
if isotope_num:
num2 *= mass_data[element_name][isotope_num][1] ** isotope_content
return num2 * (num1 / denom)
[docs]
def isotopologues(*args, **kwargs):
"""Iterate over possible isotopic states of a molecule.
The molecule can be defined by formula, sequence, parsed sequence, or composition.
The space of possible isotopic compositions is restrained by parameters
``elements_with_isotopes``, ``isotope_threshold``, ``overall_threshold``.
Parameters
----------
formula : str, optional
A string with a chemical formula.
sequence : str, optional
A polypeptide sequence string in modX notation.
parsed_sequence : list of str, optional
A polypeptide sequence parsed into a list of amino acids.
composition : :py:class:`Composition`, optional
A :py:class:`Composition` object with the elemental composition of a
substance.
report_abundance : bool, optional
If :py:const:`True`, the output will contain 2-tuples: `(composition, abundance)`.
Otherwise, only compositions are yielded. Default is :py:const:`False`.
elements_with_isotopes : container of str, optional
A set of elements to be considered in isotopic distribution
(by default, every element has an isotopic distribution).
isotope_threshold : float, optional
The threshold abundance of a specific isotope to be considered.
Default is :py:const:`5e-4`.
overall_threshold : float, optional
The threshold abundance of the calculateed isotopic composition.
Default is :py:const:`0`.
aa_comp : dict, optional
A dict with the elemental composition of the amino acids (the
default value is :py:data:`std_aa_comp`).
mass_data : dict, optional
A dict with the masses of chemical elements (the default
value is :py:data:`nist_mass`).
Returns
-------
out : iterator
Iterator over possible isotopic compositions.
"""
iso_threshold = kwargs.pop('isotope_threshold', 5e-4)
overall_threshold = kwargs.pop('overall_threshold', 0.0)
mass_data = kwargs.get('mass_data', nist_mass)
elements_with_isotopes = kwargs.get('elements_with_isotopes')
report_abundance = kwargs.get('report_abundance', False)
composition = Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)
other_kw = kwargs.copy()
for k in Composition._kw_sources:
other_kw.pop(k, None)
dict_elem_isotopes = {}
for element in composition:
if elements_with_isotopes is None or element in elements_with_isotopes:
element_name, isotope_num = _parse_isotope_string(element)
isotopes = {k: v for k, v in mass_data[element_name].items() if k != 0 and v[1] >= iso_threshold}
list_isotopes = [_make_isotope_string(element_name, k) for k in isotopes]
dict_elem_isotopes[element] = list_isotopes
else:
dict_elem_isotopes[element] = [element]
all_isotoplogues = []
for element, list_isotopes in dict_elem_isotopes.items():
n = composition[element]
list_comb_element_n = []
for elementXn in combinations_with_replacement(list_isotopes, n):
list_comb_element_n.append(elementXn)
all_isotoplogues.append(list_comb_element_n)
for isotopologue in product(*all_isotoplogues):
ic = Composition(formula=''.join(atom for el in isotopologue for atom in el), **other_kw)
if report_abundance or overall_threshold > 0.0:
abundance = isotopic_composition_abundance(composition=ic, **other_kw)
if abundance > overall_threshold:
if report_abundance:
yield (ic, abundance)
else:
yield ic
else:
yield ic
std_aa_mass = {
'G': 57.02146372057,
'A': 71.03711378471,
'S': 87.03202840427001,
'P': 97.05276384885,
'V': 99.06841391299,
'T': 101.04767846841,
'C': 103.00918478471,
'L': 113.08406397713001,
'I': 113.08406397713001,
'J': 113.08406397713001,
'N': 114.04292744114001,
'D': 115.02694302383001,
'Q': 128.05857750527997,
'K': 128.09496301399997,
'E': 129.04259308796998,
'M': 131.04048491299,
'H': 137.05891185845002,
'F': 147.06841391298997,
'U': 150.95363508471,
'R': 156.10111102359997,
'Y': 163.06332853254997,
'W': 186.07931294985997,
'O': 237.14772686284996}
"""A dictionary with monoisotopic masses of the twenty standard
amino acid residues, selenocysteine and pyrrolysine.
"""
[docs]
def fast_mass(sequence, ion_type=None, charge=None, **kwargs):
"""Calculate monoisotopic mass of an ion using the fast
algorithm. May be used only if amino acid residues are presented in
one-letter code.
Parameters
----------
sequence : str
A polypeptide sequence string.
ion_type : str, optional
If specified, then the polypeptide is considered to be
in a form of corresponding ion. Do not forget to
specify the charge state!
charge : int, optional
If not 0 then m/z is calculated: the mass is increased
by the corresponding number of proton masses and divided
by z.
mass_data : dict, optional
A dict with the masses of chemical elements (the default
value is :py:data:`nist_mass`).
aa_mass : dict, optional
A dict with the monoisotopic mass of amino acid residues
(default is std_aa_mass);
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
Returns
-------
mass : float
Monoisotopic mass or m/z of a peptide molecule/ion.
"""
aa_mass = kwargs.get('aa_mass', std_aa_mass)
try:
mass = sum(aa_mass[i] for i in sequence)
except KeyError as e:
raise PyteomicsError('No mass data for residue: ' + e.args[0])
mass_data = kwargs.get('mass_data', nist_mass)
mass += mass_data['H'][0][0] * 2 + mass_data['O'][0][0]
if ion_type:
try:
icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type]
except KeyError:
raise PyteomicsError('Unknown ion type: {}'.format(ion_type))
mass += sum(mass_data[element][0][0] * num for element, num in icomp.items())
if charge:
mass = (mass + mass_data['H+'][0][0] * charge) / charge
return mass
[docs]
def fast_mass2(sequence, ion_type=None, charge=None, **kwargs):
"""Calculate monoisotopic mass of an ion using the fast
algorithm. *modX* notation is fully supported.
Parameters
----------
sequence : str
A polypeptide sequence string.
ion_type : str, optional
If specified, then the polypeptide is considered to be
in a form of corresponding ion. Do not forget to
specify the charge state!
charge : int, optional
If not 0 then m/z is calculated: the mass is increased
by the corresponding number of proton masses and divided
by z.
mass_data : dict, optional
A dict with the masses of chemical elements (the default
value is :py:data:`nist_mass`).
aa_mass : dict, optional
A dict with the monoisotopic mass of amino acid residues
(default is std_aa_mass).
ion_comp : dict, optional
A dict with the relative elemental compositions of peptide ion
fragments (default is :py:data:`std_ion_comp`).
Returns
-------
mass : float
Monoisotopic mass or m/z of a peptide molecule/ion.
"""
aa_mass = kwargs.get('aa_mass', std_aa_mass)
mass_data = kwargs.get('mass_data', nist_mass)
try:
comp = parser.amino_acid_composition(sequence,
show_unmodified_termini=True,
allow_unknown_modifications=True,
labels=aa_mass)
except PyteomicsError:
raise PyteomicsError('Mass not specified for label(s): {}'.format(
', '.join(set(parser.parse(sequence)).difference(aa_mass))))
try:
mass = 0
for aa, num in comp.items():
if aa in aa_mass:
mass += aa_mass[aa] * num
elif parser.is_term_mod(aa):
assert num == 1
group = aa.strip('-')
if group.islower() and group in aa_mass:
_raise_term_label_exception('mass')
else:
try:
mass += calculate_mass(formula=group, mass_data=mass_data)
except PyteomicsError:
raise PyteomicsError('Could not parse terminal group as a formula: {}'.format(group))
else:
mod, X = parser._split_label(aa)
mass += (aa_mass[mod] + aa_mass[X]) * num
except KeyError as e:
raise PyteomicsError('Unspecified mass for modification: "{}"'.format(e.args[0]))
if ion_type:
try:
icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type]
except KeyError:
raise PyteomicsError('Unknown ion type: {}'.format(ion_type))
mass += sum(mass_data[element][0][0] * num
for element, num in icomp.items())
if charge:
mass = (mass + mass_data['H+'][0][0] * charge) / charge
return mass
[docs]
class Unimod():
"""A class for Unimod database of modifications.
The list of all modifications can be retrieved via `mods` attribute.
Methods for convenient searching are `by_title` and `by_name`.
For more elaborate filtering, iterate manually over the list.
.. note::
See :py:mod:`pyteomics.mass.unimod` for a new alternative class with
more features.
"""
[docs]
def __init__(self, source='http://www.unimod.org/xml/unimod.xml'):
"""Create a database and fill it from XML file retrieved from `source`.
Parameters
----------
source : str or file, optional
A file-like object or a URL to read from. Don't forget the ``'file://'``
prefix when pointing to local files.
"""
from lxml import etree
from ..xml import _local_name
def process_mod(mod):
d = mod.attrib
new_d = {}
for key in ('date_time_modified', 'date_time_posted'):
new_d[key] = datetime.strptime(d.pop(key), '%Y-%m-%d %H:%M:%S')
comp = Composition()
for delta in self._xpath('delta', mod): # executed 1 time
for key in ('avge_mass', 'mono_mass'):
new_d[key] = float(delta.attrib.pop(key))
for elem in self._xpath('element', delta):
e_d = elem.attrib
amount = int(e_d.pop('number'))
label = e_d.pop('symbol')
isotope, symbol = re.match(r'^(\d*)(\D+)$', label).groups()
if not isotope:
isotope = 0
else:
isotope = int(isotope)
comp += Composition(formula=_make_isotope_string(symbol, isotope), mass_data=self._massdata) * amount
new_d['composition'] = comp
new_d['record_id'] = int(d.pop('record_id'))
new_d['approved'] = d.pop('approved') == '1'
new_d.update(d)
spec = []
for sp in self._xpath('specificity', mod):
sp_d = sp.attrib
sp_new_d = {}
sp_new_d['hidden'] = (sp_d.pop('hidden') == '1')
sp_new_d['spec_group'] = int(sp_d.pop('spec_group'))
sp_new_d.update(sp_d)
notes = []
for note in self._xpath('*', sp):
if note.text and note.text.strip():
notes.append(note.text.strip())
if notes:
sp_new_d['note'] = '\n'.join(notes)
spec.append(sp_new_d)
new_d['specificity'] = spec
alt_names = []
for alt_name in self._xpath('alt_name', mod):
alt_names.append(alt_name.text)
if alt_names:
new_d['alt_names'] = alt_names
refs = []
for ref in self._xpath('xref', mod):
ref_d = {}
for sub in ref.iterchildren():
ref_d[_local_name(sub)] = sub.text
for key in ('text', 'source', 'url'):
if key not in ref_d:
ref_d[key] = None
refs.append(ref_d)
new_d['refs'] = refs
return new_d
if isinstance(source, str):
self._tree = etree.parse(urlopen(source))
else:
self._tree = etree.parse(source)
self._massdata = self._mass_data()
self._mods = []
self._id = {}
for i, mod in enumerate(self._xpath('/unimod/modifications/mod')):
mod_dict = process_mod(mod)
self._mods.append(mod_dict)
self._id[mod_dict['record_id']] = i
def _xpath(self, path, element=None):
from ..xml import xpath
if element is None:
return xpath(self._tree, path, 'umod')
return xpath(element, path, 'umod')
def _mass_data(self):
massdata = defaultdict(dict)
elements = [x.attrib for x in self._xpath('/unimod/elements/elem')]
avg = {}
for elem in elements:
i, label = re.match(r'^(\d*)(\D+)$', elem['title']).groups()
if not i:
iso = 0
else:
iso = int(i)
massdata[label][iso] = (float(elem['mono_mass']), float(iso == 0))
if not iso:
avg[label] = float(elem['avge_mass'])
for elem, isotopes in massdata.items():
isotopes[int(round(isotopes[0][0]))] = isotopes[0]
if len(isotopes) == 3:
m1, m2 = (x[1][0] for x in sorted(isotopes.items())[1:])
m_avg = avg[elem]
a = (m2 - m_avg) / (m2 - m1)
b = (m_avg - m1) / (m2 - m1)
for state, abundance in zip(sorted(isotopes)[1:], (a, b)):
isotopes[state] = (isotopes[state][0], abundance)
return massdata
@property
def mods(self):
"""Get the list of Unimod modifications"""
return self._mods
@property
def mass_data(self):
"""Get element mass data extracted from the database"""
return self._massdata
[docs]
def by_title(self, title, strict=True):
"""Search modifications by title. If a single modification is found,
it is returned. Otherwise, a list will be returned.
Parameters
----------
title : str
The modification title.
strict : bool, optional
If :py:const:`False`, the search will return all modifications
whose title **contains** `title`, otherwise equality is required.
:py:const:`True` by default.
Returns
-------
out : dict or list
A single modification or a list of modifications.
"""
f = {True: operator.eq, False: operator.contains}
func = f[strict]
result = [m for m in self._mods if func(m['title'], title)]
if len(result) == 1:
return result[0]
return result
[docs]
def by_name(self, name, strict=True):
"""Search modifications by name. If a single modification is found,
it is returned. Otherwise, a list will be returned.
Parameters
----------
name : str
The full name of the modification(s).
strict : bool, optional
If :py:const:`False`, the search will return all modifications
whose full name **contains** `title`, otherwise equality is
required. :py:const:`True` by default.
Returns
-------
out : dict or list
A single modification or a list of modifications.
"""
f = {True: operator.eq, False: operator.contains}
func = f[strict]
result = [m for m in self._mods if func(m['full_name'], name)]
if len(result) == 1:
return result[0]
return result
[docs]
def by_id(self, i):
"""Search modifications by record ID. If a modification is found,
it is returned. Otherwise, :py:exc:`KeyError` is raised.
Parameters
----------
i : int or str
The Unimod record ID.
Returns
-------
out : dict
A single modification dict.
"""
if isinstance(i, str):
i = int(i)
return self._mods[self._id[i]]
__getitem__ = by_id
def neutral_mass(mz, z, charge_carrier=_nist_mass[PROTON][0][0]):
return (mz * abs(z)) - (z * charge_carrier)
def mass_charge_ratio(neutral_mass, z, charge_carrier=_nist_mass[PROTON][0][0]):
return (neutral_mass + (z * charge_carrier)) / abs(z)