Pyteomics documentation v5.0

pyteomics.proforma

Contents

Source code for pyteomics.proforma

'''
proforma - Proteoform and Peptidoform Notation
==============================================

ProForma is a notation for defining modified amino acid sequences using
a set of controlled vocabularies, as well as encoding uncertain or partial
information about localization. See `ProForma specification <https://www.psidev.info/proforma>`_
for more up-to-date information.

For more details, see the :mod:`pyteomics.proforma` online.
'''

import itertools
import re
import warnings
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, ClassVar, Sequence, Tuple, Type, Union, Generic, TypeVar, NamedTuple
from collections import Counter, deque, namedtuple
from functools import partial
from itertools import chain
from array import array as _array
from enum import Enum
from numbers import Integral

from .mass import Composition, std_aa_mass, Unimod, nist_mass, calculate_mass, std_ion_comp, mass_charge_ratio, std_aa_comp
from .auxiliary import PyteomicsError, BasicComposition
from .auxiliary.utils import add_metaclass, memoize
from .auxiliary.psims_util import load_psimod, load_xlmod, load_gno, obo_cache, _has_psims

try:
    import numpy as np
except ImportError:
    np = None


_WATER_MASS = calculate_mass(formula="H2O")

std_aa_mass = std_aa_mass.copy()
std_aa_mass['X'] = 0

element_symbols = set(nist_mass)

T = TypeVar('T')


class ProFormaError(PyteomicsError):
    def __init__(self, message, index=None, parser_state=None, **kwargs):
        super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state)
        self.message = message
        self.index = index
        self.parser_state = parser_state


class PrefixSavingMeta(type):
    '''A subclass-registering-metaclass that provides easy
    lookup of subclasses by prefix attributes.
    '''

    def __new__(mcs, name, parents, attrs):
        new_type = type.__new__(mcs, name, parents, attrs)
        prefix = attrs.get("prefix_name")
        if prefix:
            new_type.prefix_map[prefix.lower()] = new_type
        short = attrs.get("short_prefix")
        if short:
            new_type.prefix_map[short.lower()] = new_type
        return new_type

    def find_by_tag(self, tag_name):
        if tag_name is None:
            raise ValueError("tag_name cannot be None!")
        tag_name = tag_name.lower()
        return self.prefix_map[tag_name]


[docs] class TagTypeEnum(Enum): unimod = 0 psimod = 1 massmod = 2 generic = 3 info = 4 gnome = 5 xlmod = 6 formula = 7 glycan = 8 localization_marker = 9 position_label = 10 position_modifier = 11 comup = 12 comkp = 13 limit = 14 custom = 15 group_placeholder = 999
class ModificationTagStyle(Enum): Unset = 0 ShortId = 1 LongId = 2 ShortName = 3 LongName = 4 class ModificationSourceType(Enum): """ Whether a tag was generated from explicit user input (``Explicit``), a constant modification rule (``Constant``), or from a variable expansion (``Generated``). Used to track sources in :class:`ProteoformCombinator` machinery. """ Explicit = 0 Constant = 1 Generated = 2 _sentinel = object() class ModificationMassNotFoundError(ProFormaError): pass class CompositionNotFoundError(ProFormaError): pass class MissingChargeStateError(ProFormaError): pass class UnknownMonosaccharideError(ProFormaError): pass
[docs] @add_metaclass(PrefixSavingMeta) class TagBase(object): '''A base class for all tag types. Attributes ---------- type: Enum An element of :class:`TagTypeEnum` saying what kind of tag this is. value: object The data stored in this tag, usually an externally controlled name extra: list Any extra tags that were nested within this tag. Usually limited to INFO tags but may be other synonymous controlled vocabulary terms. group_id: str or None A short label denoting which group, if any, this tag belongs to ''' __slots__ = ("type", "value", "extra", "group_id", ) type: TagTypeEnum value: Any extra: List["TagBase"] group_id: Optional[str] prefix_name: ClassVar[Optional[str]] = None short_prefix: ClassVar[Optional[str]] = None prefix_map: ClassVar[Dict[str, Type['TagBase']]] = {}
[docs] def __init__(self, type, value, extra=None, group_id=None): self.type = type self.value = value self.extra = extra or [] self.group_id = group_id
def copy(self): return self.__class__(self.value, [e.copy() for e in self.extra], self.group_id) def __str__(self): part = self._format_main() had_marker = False if self.extra: rest = [] for e in self.extra: rest.append(str(e)) had_marker |= isinstance(e, GroupLabelBase) and e.group_id == self.group_id label = '|'.join([part] + rest) else: label = part if self.group_id and not had_marker: label = '%s%s' % (label, self.group_id) return '%s' % label def __repr__(self): template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})" return template.format(self=self) def __eq__(self, other): if other is None: return False if isinstance(other, str): return str(self) == other return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \ and (self.group_id == other.group_id) def __hash__(self) -> int: return hash((self.type, self.value, tuple(self.extra), self.group_id)) def __ne__(self, other): return not self == other def is_modification(self) -> bool: return self.type in ( TagTypeEnum.formula, TagTypeEnum.generic, TagTypeEnum.glycan, TagTypeEnum.gnome, TagTypeEnum.unimod, TagTypeEnum.massmod, TagTypeEnum.psimod, TagTypeEnum.custom, ) def find_modification(self) -> Optional["TagBase"]: if self.is_modification(): return self for tag in self.extra: if tag.is_modification(): return tag return None
[docs] def find_tag_type(self, tag_type: TagTypeEnum) -> List['TagBase']: '''Search this tag or tag collection for elements with a particular tag type and return them. Parameters ---------- tag_type : TagTypeEnum A label from :class:`TagTypeEnum`, or an equivalent type. Returns ------- matches : list The list of all tags in this object which match the requested tag type. ''' out = [] if self.type == tag_type: out.append(self) if not self.extra: return out for e in self.extra: if e.type == tag_type: out.append(e) return out
@classmethod def parse(cls, buffer) -> 'TagBase': return process_tag_tokens(buffer)
[docs] def has_mass(self) -> bool: """ Check if this tag carries a mass value. Returns ------- bool """ return False
def has_composition(self) -> bool: return False def __or__(self, other): this = self.copy() this.extra.append(other.copy()) return this @property def limit(self) -> int: limit_tags = self.find_tag_type(TagTypeEnum.limit) if limit_tags: if len(limit_tags) > 1: warnings.warn(f"{len(limit_tags)} Limit tags were found for {self}, this is undefined behavior. Only the first tag will be used") return limit_tags[0].value return 1
class GroupLabelBase(TagBase): __slots__ = () def __str__(self): part = self._format_main() if self.extra: rest = [str(e) for e in self.extra] label = '|'.join([part] + rest) else: label = part return '%s' % label def __hash__(self): return hash(str(self))
[docs] class PositionLabelTag(GroupLabelBase): '''A tag to mark that a position is involved in a group in some way, but does not imply any specific semantics. ''' __slots__ = ()
[docs] def __init__(self, value=None, extra=None, group_id=None): assert group_id is not None value = group_id super(PositionLabelTag, self).__init__( TagTypeEnum.position_label, value, extra, group_id)
def _format_main(self): return "{self.group_id}".format(self=self)
[docs] class LocalizationMarker(GroupLabelBase): '''A tag to mark a particular localization site ''' __slots__ = ()
[docs] def __init__(self, value, extra=None, group_id=None): assert group_id is not None super(LocalizationMarker, self).__init__( TagTypeEnum.localization_marker, float(value), extra, group_id)
def _format_main(self): return "{self.group_id}({self.value:.4g})".format(self=self)
[docs] class InformationTag(TagBase): '''A tag carrying free text describing the location ''' __slots__ = () prefix_name = "INFO"
[docs] def __init__(self, value, extra=None, group_id=None): super(InformationTag, self).__init__( TagTypeEnum.info, str(value), extra, group_id)
def _format_main(self): return f"INFO:{self.value}"
class PositionModifierTag(TagBase): __slots__ = () prefix_name = "Position" def __init__(self, value, extra=None, group_id=None): super().__init__(TagTypeEnum.position_modifier, value, extra, group_id) def __eq__(self, other): return super().__eq__(other) def __hash__(self): return hash(self.value) def _format_main(self): return f"{self.prefix_name}:{self.value}" class LimitModifierTag(TagBase): __slots__ = () prefix_name = "Limit" def __init__(self, value, extra=None, group_id=None): if not isinstance(value, (int, float)): try: value = int(value) except (ValueError, TypeError): pass super().__init__(TagTypeEnum.limit, value, extra, group_id) def _format_main(self): return f"{self.prefix_name}:{self.value}" class ColocaliseModificationsOfKnownPostionTag(TagBase): __slots__ = () prefix_name = "ColocaliseModificationsOfKnownPosition" short_prefix = "CoMKP" def __init__(self, extra=None, group_id=None): super().__init__(TagTypeEnum.comkp, None, extra, group_id) def copy(self): return self.__class__([e.copy() for e in (self.extra or [])], self.group_id) def _format_main(self): return self.short_prefix class ColocaliseModificationsOfUnknownPostionTag(TagBase): __slots__ = () prefix_name = "ColocaliseModificationsOfUnknownPosition" short_prefix = "CoMUP" def __init__(self, extra=None, group_id=None): super().__init__(TagTypeEnum.comup, None, extra, group_id) def copy(self): return self.__class__([e.copy() for e in self.extra or []], self.group_id) def _format_main(self): return self.short_prefix
[docs] class ModificationResolver(object): name: str symbol: str _database: Optional[Any] _cache: Optional[Dict[Tuple[Optional[str], Optional[int], frozenset], Any]]
[docs] def __init__(self, name, **kwargs): self.name = name.lower() self.symbol = self.name[0] self._database = None self._cache = {}
[docs] def clear_cache(self): """Clear the modification definition cache""" self._cache.clear()
[docs] def enable_caching(self, flag: bool=True): """ Enable or disable caching of modification definitions. If `flag` is :const:`False`, this will also dispose of any existing cached values. Parameters ---------- flag : :class:`bool` Whether or not to disable the cache """ if flag: if not self._cache: self._cache = {} else: self._cache = None
def load_database(self): raise NotImplementedError() @property def database(self): if not self._database: self._database = self.load_database() return self._database @database.setter def database(self, database): self._database = database
[docs] def parse_identifier(self, identifier: str) -> Tuple[Optional[str], Optional[int]]: """Parse a string that is either a CV prefixed identifier or name. Parameters ---------- identifier : str The identifier string to parse, removing CV prefix as needed. Returns ------- name : str, optional A textual identifier embedded in the qualified identifier, if any, otherwise :const:`None`. id : int, optional An integer ID embedded in the qualified identifier, if any, otherwise :const:`None`. """ tokens = identifier.split(":", 1) if len(tokens) > 1: prefix = tokens[0].lower() if prefix == self.name or prefix == self.symbol: identifier = tokens[1] if identifier.isdigit(): id = int(identifier) name = None else: name = identifier id = None return name, id
def _resolve_impl(self, name: str=None, id: int=None, **kwargs) -> Dict[str, Any]: raise NotImplementedError() def resolve(self, name: str=None, id: int=None, **kwargs): if self._cache is None: return self._resolve_impl(name, id, **kwargs) cache_key = (name, id, frozenset(kwargs.items())) if cache_key in self._cache: return self._cache[cache_key].copy() try: value = self._resolve_impl(name, id, **kwargs) except KeyError: if name.startswith(("+", "-")): value = { "composition": None, "mass": float(name), "name": name, "id": None, "provider": self.name, "source": self, } else: raise self._cache[cache_key] = value return value.copy() def __call__(self, name=None, id=None, **kwargs): return self.resolve(name, id, **kwargs) def __eq__(self, other): return self.name == other.name def __ne__(self, other): return not self == other def __hash__(self): return hash(self.name)
[docs] class UnimodResolver(ModificationResolver):
[docs] def __init__(self, **kwargs): super(UnimodResolver, self).__init__("unimod", **kwargs) self._database = kwargs.get("database") self.strict = kwargs.get("strict", True)
def load_database(self): if _has_psims: return obo_cache.resolve("http://www.unimod.org/obo/unimod.obo") return Unimod() def _resolve_impl(self, name=None, id=None, **kwargs): strict = kwargs.get("strict", self.strict) exhaustive = kwargs.get("exhaustive", True) if name is not None: defn = self.database.by_title(name, strict=strict) if not defn: defn = self.database.by_name(name, strict=strict) if not defn and exhaustive and strict: defn = self.database.by_title(name, strict=False) if not defn: defn = self.database.by_name(name, strict=False) if defn and isinstance(defn, list): warnings.warn( "Multiple matches found for {!r} in Unimod, taking the first, {}.".format( name, defn[0]['record_id'])) defn = defn[0] if not defn: raise KeyError(name) elif id is not None: defn = self.database[id] if not defn: raise KeyError(id) else: raise ValueError("Must provide one of `name` or `id`") if isinstance(defn, dict): return { 'composition': defn['composition'], 'name': defn['title'], 'id': defn['record_id'], 'mass': defn['mono_mass'], 'provider': self.name, "source": self } else: name = defn.ex_code_name if not name: name = defn.code_name return { "composition": defn.composition, "name": name, "id": defn.id, "mass": defn.monoisotopic_mass, "provider": self.name, "source": self }
[docs] class PSIModResolver(ModificationResolver):
[docs] def __init__(self, **kwargs): super(PSIModResolver, self).__init__('psimod', **kwargs) self._database = kwargs.get("database")
def load_database(self): return load_psimod() def _resolve_impl(self, name=None, id=None, **kwargs): if name is not None: defn = self.database[name] elif id is not None: defn = self.database['MOD:{:05d}'.format(id)] else: raise ValueError("Must provide one of `name` or `id`") # Non-standard amino acids are listed with `DiffMono` = `none` # but have a valid `MassMono` definition. Normally, `MassMono` is # the full mass of the residue plus the modification so it'd double count the # amino acid to use that value. Non-standard amino acids are a special case # because they *should* only be used with the amino acid X mass = None for key in ["DiffMono", "MassMono"]: if key in defn: try: mass = float(defn[key]) break except (KeyError, TypeError, ValueError): continue else: raise ModificationMassNotFoundError( "Could not resolve the mass of %r from %r" % ((name, id), defn) ) # As with `DiffMono` for non-standard amino acids, but for chemical formulas -> Compositions for key in ["DiffFormula", "Formula"]: if key in defn and defn[key] is not None: composition = Composition() diff_formula_tokens = defn[key].strip().split(" ") for i in range(0, len(diff_formula_tokens), 2): element = diff_formula_tokens[i] count = diff_formula_tokens[i + 1] if count: count = int(count) if element.startswith("("): j = element.index(")") isotope = element[1:j] element = "%s[%s]" % (element[j + 1:], isotope) composition[element] += count break else: composition = None warnings.warn("No formula was found for %r in PSI-MOD, composition will be missing" % ((name, id), )) return { 'mass': mass, 'composition': composition, 'name': defn.name, 'id': defn.id, 'provider': self.name, "source": self }
[docs] class XLMODResolver(ModificationResolver):
[docs] def __init__(self, **kwargs): super(XLMODResolver, self).__init__('xlmod', **kwargs) self._database = kwargs.get("database")
def load_database(self): return load_xlmod() def _parse_formula(self, formula: str): formula: str = formula.replace("D", "H[2]") tokens = formula.split(' ') composition = Composition() for token in tokens: sign = 1 if token.startswith("-"): token = token[1:] sign = -1 composition += Composition(token) * sign return composition def _resolve_impl(self, name=None, id=None, **kwargs): if name is not None: defn = self.database[name] elif id is not None: defn = self.database['XLMOD:{:05d}'.format(id)] else: raise ValueError("Must provide one of `name` or `id`") try: mass = float(defn['monoIsotopicMass']) except (KeyError, TypeError, ValueError): raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn)) if 'deadEndFormula' in defn: composition = self._parse_formula(defn["deadEndFormula"]) elif 'bridgeFormula' in defn: composition = self._parse_formula(defn["bridgeFormula"]) return { 'mass': mass, 'composition': composition, 'name': defn.name, 'id': defn.id, 'provider': self.name, "source": self }
# TODO: Implement resolve walking up the graph to get the mass. Can't really # get any more information without glypy/glyspace interaction
[docs] class GNOResolver(ModificationResolver): mass_pattern = re.compile(r"(\d+(:?\.\d+)) Da")
[docs] def __init__(self, **kwargs): super(GNOResolver, self).__init__('gnome', **kwargs) self._database = kwargs.get("database")
def load_database(self): return load_gno()
[docs] def get_mass_from_glycan_composition(self, term): '''Parse the Byonic-style glycan composition from property GNO:00000202 to get the counts of each monosaccharide and use that to calculate mass. The mass computed here is exact and dehydrated, distinct from the rounded-off mass that :meth:`get_mass_from_term` will produce by walking up the CV term hierarchy. However, not all glycan compositions are representable in GNO:00000202 format, so this may silently be absent or incomplete, hence the double-check in :meth:`get_mass_from_term`. Parameters ---------- term : psims.controlled_vocabulary.Entity The CV entity being parsed. Returns ------- mass : float or :const:`None` If a glycan composition is found on the term, the computed mass will be returned. Otherwise the :const:`None` is returned ''' val = term.get('GNO:00000202') monosaccharides = BasicComposition() composition = Composition() if val: tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val) mass = 0.0 for symbol, count in tokens: count = int(count) try: mono_mass, mono_comp, symbol = GlycanModification.valid_monosaccharides[symbol] mass += mono_mass * count composition += mono_comp * count monosaccharides[symbol] += count except KeyError: continue return mass, monosaccharides, composition return None, None, None
[docs] def get_mass_from_term(self, term, raw_mass): '''Walk up the term hierarchy and find the mass group term near the root of the tree, and return the most accurate mass available for the provided term. The mass group term's mass is rounded to two decimal places, leading to relatively large errors. Parameters ---------- term : psims.controlled_vocabulary.Entity The CV entity being parsed. Returns ------- mass : float or :const:`None` If a root node is found along the term's lineage, computed mass will be returned. Otherwise the :const:`None` is returned. The mass may be ''' root_id = 'GNO:00000001' parent = term.parent() if isinstance(parent, list): parent = parent[0] while parent.id != root_id: next_parent = parent.parent() if isinstance(next_parent, list): next_parent = next_parent[0] if next_parent.id == root_id: break parent = next_parent match = self.mass_pattern.search(parent.name) if not match: return None # This will have a small mass error. rough_mass = float(match.group(1)) - _WATER_MASS if raw_mass is not None and abs(rough_mass - raw_mass) < 1: return raw_mass warnings.warn( ("An accurate glycan composition could not be inferred from %s. " "Only a rough approximation is available.") % (term, )) return rough_mass
def _resolve_impl(self, name=None, id=None, **kwargs): if name is not None: term = self.database[name] elif id is not None: term = self.database[id] else: raise ValueError("Must provide one of `name` or `id`") raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term) rec = { "name": term.name, "id": term.id, "provider": self.name, "composition": composition, "monosaccharides": monosaccharides, "mass": self.get_mass_from_term(term, raw_mass), "source": self } return rec
[docs] class GenericResolver(ModificationResolver):
[docs] def __init__(self, resolvers, **kwargs): super(GenericResolver, self).__init__('generic', **kwargs) self.resolvers = list(resolvers)
def load_database(self): return None
[docs] def parse_identifier(self, identifier): """Parse a string that is either a CV prefixed identifier or name. Does no parsing as a :class:`GenericModification` is never qualified. Parameters ---------- identifier : str The identifier string to parse, removing CV prefix as needed. Returns ------- name : str, optional A textual identifier embedded in the qualified identifier, if any, otherwise :const:`None`. id : int, optional An integer ID embedded in the qualified identifier, if any, otherwise :const:`None`. """ return identifier, None
def _resolve_impl(self, name=None, id=None, **kwargs): defn = None for resolver in self.resolvers: try: defn = resolver(name=name, id=id, **kwargs) break except KeyError: continue except ModificationMassNotFoundError: warnings.warn("Could not resolve the mass for %r in %r" % ((name, id), resolver)) continue if defn is None: if name is None: raise KeyError(id) elif id is None: raise KeyError(name) else: raise ValueError("Must provide one of `name` or `id`") return defn
class CustomResolver(ModificationResolver): store: Dict[str, Dict[str, Any]] def __init__(self, store: Dict[str, Dict[str, Any]]=None, **kwargs): if store is None: store = {} super().__init__("custom", **kwargs) self.store = store def _resolve_impl(self, name = None, id = None, **kwargs): if name is not None: return self.store[name] elif id is not None: return self.store[id] else: raise ValueError("Must provide one of `name` or `id`") def register(self, name, state: Dict[str, Any], **kwargs): state = state.copy() state.update(kwargs) state['id'] = name no_mass = "mass" not in state no_comp = "composition" not in state if no_mass and no_comp: raise ValueError("A custom modification definition *must* include at least one of `mass` or `composition`") self.store[name] = state
[docs] class ModificationBase(TagBase): '''A base class for all modification tags with marked prefixes. While :class:`ModificationBase` is hashable, its equality testing brings in additional tag-related information. For pure modification identity comparison, use :attr:`key` to get a :class:`ModificationToken` free of these concerns. ''' _tag_type = None __slots__ = ('_definition', 'style', '_generated') _generated: ModificationSourceType
[docs] def __init__(self, value, extra=None, group_id=None, style=None): if style is None: style = ModificationTagStyle.Unset super(ModificationBase, self).__init__( self._tag_type, value, extra, group_id) self._definition = None self._generated = ModificationSourceType.Explicit self.style = style
def copy(self): return self.__class__(self.value, [e.copy() for e in self.extra], self.group_id, self.style) def __reduce__(self): return self.__class__, (self.value, self.extra, self.group_id, self.style), self.__getstate__() def __getstate__(self): if self._definition is None: return None state = self._definition.copy() state['source'] = None return state def __setstate__(self, state): self._definition = state def __eq__(self, other): if isinstance(other, ModificationToken): return other == self return super(ModificationBase, self).__eq__(other) def __hash__(self): return hash((self.id, self.provider)) @property def key(self) -> 'ModificationToken': '''Get a safe-to-hash-and-compare :class:`ModificationToken` representing this modification without tag-like properties. Returns -------- ModificationToken ''' return ModificationToken(self.value, self.id, self.provider, self.__class__) @property def definition(self) -> Dict[str, Any]: '''A :class:`dict` of properties describing this modification, given by the providing controlled vocabulary. This value is cached, and should not be modified. Returns ------- dict ''' if self._definition is None: self._definition = self.resolve() return self._definition @property def mass(self) -> Optional[float]: '''The monoisotopic mass shift this modification applies Returns ------- float ''' return self.definition['mass']
[docs] def has_mass(self) -> bool: """ Check if this tag carries a mass value. Returns ------- bool """ return True
def has_composition(self): return True @property def composition(self) -> Optional[Composition]: '''The chemical composition shift this modification applies''' return self.definition.get('composition') @property def charge(self) -> Optional[int]: return self.definition.get('charge') @property def id(self) -> Optional[int]: '''The unique identifier given to this modification by its provider Returns ------- str or int ''' return self.definition.get('id') @property def name(self): '''The primary name of this modification from its provider. Returns ------- str ''' return self.definition.get('name') @property def provider(self): '''The name of the controlled vocabulary that provided this modification. Returns ------- str ''' return self.definition.get('provider') def _populate_from_definition(self, definition): self._definition = definition def _format_main(self) -> str: if self.style == ModificationTagStyle.Unset or self.style is None: return "{self.prefix_name}:{self.value}".format(self=self) elif self.style == ModificationTagStyle.LongId: return "{self.prefix_name}:{self.id}".format(self=self) elif self.style == ModificationTagStyle.ShortId: return "{self.short_prefix}:{self.id}".format(self=self) elif self.style == ModificationTagStyle.LongName: return "{self.prefix_name}:{self.name}".format(self=self) elif self.style == ModificationTagStyle.ShortName: return "{self.short_prefix}:{self.name}".format(self=self) else: warnings.warn("Unknown formatting style {!r}".format(self.style)) return "{self.prefix_name}:{self.value}".format(self=self)
[docs] def resolve(self): '''Find the term and return it's properties ''' keys = self.resolver.parse_identifier(self.value) return self.resolver(*keys)
[docs] class MassModification(TagBase): '''A modification defined purely by a signed mass shift in Daltons. The value of a :class:`MassModification` is always a :class:`float` ''' __slots__ = ('_significant_figures', '_generated') prefix_name = "Obs" _generated: ModificationSourceType
[docs] def __init__(self, value, extra=None, group_id=None): if isinstance(value, str): sigfigs = len(value.split('.')[-1].rstrip('0')) else: sigfigs = 4 self._significant_figures = sigfigs self._generated = ModificationSourceType.Explicit super(MassModification, self).__init__( TagTypeEnum.massmod, float(value), extra, group_id)
def copy(self): return self.__class__(self.value, [e.copy() for e in self.extra], self.group_id) def _format_main(self): if self.value >= 0: return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') else: return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') @property def provider(self): return None @property def id(self): return self._format_main() @property def key(self) -> "ModificationToken": '''Get a safe-to-hash-and-compare :class:`ModificationToken` representing this modification without tag-like properties. Returns -------- ModificationToken ''' return ModificationToken(self.value, self.id, self.provider, self.__class__) @property def mass(self) -> float: return self.value
[docs] def has_mass(self) -> bool: """ Check if this tag carries a mass value. Returns ------- bool """ return True
def has_composition(self) -> bool: return False def __eq__(self, other): if isinstance(other, ModificationToken): return other == self return super(MassModification, self).__eq__(other) def __hash__(self): return hash((self.id, self.provider))
[docs] class FormulaModification(ModificationBase): prefix_name = "Formula" charge_carrier_pattern: re.Pattern = re.compile(r':z((?:-|\+)?\d*)$') isotope_pattern: re.Pattern = re.compile(r'\[(?P<isotope>\d+)(?P<element>[A-Z][a-z]*)(?P<quantity>[\-+]?\d+)\]') _tag_type = TagTypeEnum.formula @staticmethod def _normalize_isotope_notation(match): '''Rewrite ProForma isotope notation to Pyteomics-compatible isotope notation. Parameters ---------- match : Match The matched isotope notation string parsed by the regular expression. Returns reformatted : str The re-written isotope notation ''' parts = match.groupdict() return "{element}[{isotope}]{quantity}".format(**parts) @classmethod def parse(cls, value: str): normalized = value.replace(" ", "") # If there is a [ character in the formula, we know there are isotopes which # need to be normalized. if "[" in normalized: normalized = cls.isotope_pattern.sub( cls._normalize_isotope_notation, normalized ) if ":z" in normalized: matched = cls.charge_carrier_pattern.search(normalized) if not matched: raise ProFormaError( "{normalized!r} is a malformed formula".format( normalized=normalized ), None, None, ) charge = matched.group(1) charge = int(charge) normalized = cls.charge_carrier_pattern.sub("", normalized) else: charge = None composition = Composition(formula=normalized) if charge is not None: composition["e-"] = -charge return composition, charge
[docs] def resolve(self): composition, charge = self.parse(self.value) return { "mass": composition.mass(), "composition": composition, "name": self.value, "charge": charge }
monosaccharide_description = namedtuple('monosaccharide_description', ('mass', 'composition', "symbol"))
[docs] class GlycanModification(ModificationBase): prefix_name = "Glycan" _tag_type = TagTypeEnum.glycan valid_monosaccharides = { "Hex": monosaccharide_description(162.0528, Composition("C6H10O5"), "Hex"), "HexNAc": monosaccharide_description( 203.0793, Composition("C8H13N1O5"), "HexNAc" ), "HexS": monosaccharide_description(242.009, Composition("C6H10O8S1"), "HexS"), "HexP": monosaccharide_description(242.0191, Composition("C6H11O8P1"), "HexP"), "HexNAcS": monosaccharide_description( 283.0361, Composition("C8H13N1O8S1"), "HexNAcS" ), "dHex": monosaccharide_description(146.0579, Composition("C6H10O4"), "dHex"), "NeuAc": monosaccharide_description( 291.0954, Composition("C11H17N1O8"), "NeuAc" ), "NeuGc": monosaccharide_description( 307.0903, Composition("C11H17N1O9"), "NeuGc" ), "Pen": monosaccharide_description(132.0422, Composition("C5H8O4"), "Pen"), "Fuc": monosaccharide_description(146.0579, Composition("C6H10O4"), "Fuc"), "Kdn": monosaccharide_description( 250.06886740546, Composition({"C": 9, "H": 14, "O": 8}), "Kdn" ), "Kdo": monosaccharide_description( 220.05830272176, Composition({"C": 8, "H": 12, "O": 7}), "Kdo" ), "Phospho": monosaccharide_description( 79.96633052075, Composition({"P": 1, "O": 3, "H": 1}), "Phospho" ), "Sulfo": monosaccharide_description( 79.95681485867999, Composition({"S": 1, "O": 3, "H": 0}), "Sulfo" ), } valid_monosaccharides['Neu5Ac'] = valid_monosaccharides['NeuAc'] valid_monosaccharides['Neu5Gc'] = valid_monosaccharides['NeuGc'] valid_monosaccharides['Pent'] = valid_monosaccharides['Pen'] valid_monosaccharides['d-Hex'] = valid_monosaccharides['dHex'] monomer_tokenizer = re.compile( r"|".join(sorted(valid_monosaccharides.keys(), key=len, reverse=True))) tokenizer = re.compile( r"""(?: (?P<known_name>%s)| (?P<base_name>[A-Za-z]+)| (?P<charged_formula>\{ [^\}]+? \}) ) \s*(?P<count>\d*)\s*""" % monomer_tokenizer.pattern, re.X, ) @property def monosaccharides(self): return self.definition.get('monosaccharides')
[docs] def resolve(self): composite = BasicComposition() mass = 0 chemcomp = Composition() charge = 0 for hit in self.tokenizer.finditer(self.value): hit = hit.groupdict() cnt = hit['count'] tok = hit.get('known_name') base_name = hit.get('base_name') formula = hit.get('charged_formula') if cnt: cnt = int(cnt) else: cnt = 1 if tok is not None: if tok not in self.valid_monosaccharides: parts = self.monomer_tokenizer.findall(tok) t = 0 for p in parts: if p not in self.valid_monosaccharides: break t += len(p) if t != len(tok): raise ValueError("{tok!r} is not a valid monosaccharide name".format(tok=tok)) else: for p in parts: if p not in self.valid_monosaccharides: raise UnknownMonosaccharideError(p) m, c, sym = self.valid_monosaccharides[p] mass += m * cnt chemcomp += c * cnt composite[sym] += cnt else: m, c, sym = self.valid_monosaccharides[tok] mass += m * cnt chemcomp += c * cnt composite[sym] += cnt elif formula is not None: inner = FormulaModification(formula[1:-1]).resolve() mass += inner['mass'] * cnt chemcomp += inner['composition'] * cnt composite[formula] += cnt charge += inner['charge'] * cnt elif base_name is not None: parts = self.monomer_tokenizer.findall(base_name) t = 0 for p in parts: if p not in self.valid_monosaccharides: break t += len(p) if t != len(base_name): raise ValueError( f"{base_name!r} is not a valid monosaccharide name" ) else: for p in parts: if p not in self.valid_monosaccharides: raise UnknownMonosaccharideError(p) m, c, sym = self.valid_monosaccharides[p] mass += m * cnt chemcomp += c * cnt composite[sym] += cnt else: raise NotImplementedError(f"I do not know how to decode the impossible, {hit}") return { "mass": mass, "composition": chemcomp, "name": self.value, "monosaccharides": composite }
[docs] class UnimodModification(ModificationBase): __slots__ = () resolver = UnimodResolver() prefix_name = "UNIMOD" short_prefix = "U" _tag_type = TagTypeEnum.unimod
[docs] class PSIModModification(ModificationBase): __slots__ = () resolver = PSIModResolver() prefix_name = "MOD" short_prefix = 'M' _tag_type = TagTypeEnum.psimod
[docs] class GNOmeModification(ModificationBase): __slots__ = () resolver = GNOResolver() prefix_name = "GNO" short_prefix = 'G' _tag_type = TagTypeEnum.gnome @property def monosaccharides(self): return self.definition.get('monosaccharides')
[docs] class XLMODModification(ModificationBase): __slots__ = () resolver = XLMODResolver() prefix_name = "XLMOD" short_prefix = 'X' _tag_type = TagTypeEnum.xlmod
class CustomModification(ModificationBase): __slots__ = () resolver = CustomResolver() prefix_name = 'Custom' short_prefix = 'C' @classmethod def register(cls, name, state: Dict[str, Any], **kwargs): return cls.resolver.register(name, state, **kwargs)
[docs] class GenericModification(ModificationBase): __slots__ = () _tag_type = TagTypeEnum.generic resolver = GenericResolver([ # Do exact matching here first. Then default to non-strict matching as a final # correction effort. partial(UnimodModification.resolver, exhaustive=False), PSIModModification.resolver, XLMODModification.resolver, GNOmeModification.resolver, # Some really common names aren't actually found in the XML exactly, so default # to non-strict matching now to avoid masking other sources here. partial(UnimodModification.resolver, strict=False) ])
[docs] def __init__(self, value, extra=None, group_id=None, style=None): super(GenericModification, self).__init__( value, extra, group_id, style)
def _format_main(self): return self.value
[docs] def resolve(self): '''Find the term, searching through all available vocabularies and return the first match's properties ''' keys = self.resolver.parse_identifier(self.value) defn = self.resolver(*keys) if defn is not None: return defn raise KeyError(keys)
[docs] def set_unimod_path(path): '''Set the path to load the Unimod database from for resolving ProForma Unimod modifications. .. note:: This method ensures that the Unimod modification database loads quickly from a local database file instead of downloading a new copy from the internet. Parameters ---------- path : str or file-like object A path to or file-like object for the "unimod.xml" file. Returns ------- :class:`~pyteomics.mass.mass.Unimod` ''' db = Unimod(path) UnimodModification.resolver.database = db return db
[docs] class ModificationToken(object): '''Describes a particular modification from a particular provider, independent of a :class:`TagBase`'s state. This class is meant to be used in place of a :class:`ModificationBase` object when equality testing and hashing is desired, but do not want extra properties to be involved. :class:`ModificationToken` is comparable and hashable, and can be compared with :class:`ModificationBase` subclass instances safely. It can be called to create a new instance of the :class:`ModificationBase` it is equal to. Attributes ---------- name : str The name of the modification being represented, as the user specified it. id : int or str Whatever unique identifier the providing controlled vocabulary gave to this modification provider : str The name of the providing controlled vocabulary. source_cls : type A sub-class of :class:`ModificationBase` that will be used to fulfill this token if requested, providing it a resolver. ''' __slots__ = ('name', 'id', 'provider', 'source_cls') name: str id: int provider: Callable source_cls: Union[Type[ModificationBase], Type[MassModification], Type['ModificationToken']]
[docs] def __init__(self, name: str, id: int, provider: Callable, source_cls: Type): self.name = name self.id = id self.provider = provider self.source_cls = source_cls
def __eq__(self, other): if other is None: return False if isinstance(other, (ModificationToken, ModificationBase, MassModification)): return self.id == other.id and self.provider == other.provider return False def __ne__(self, other): return not self == other def __hash__(self): return hash((self.id, self.provider)) def __call__(self): '''Create a new :class:`ModificationBase` instance from the provided :attr:`name` against :attr:`source_cls`'s resolver. Returns ------- ModificationBase ''' return self.source_cls(self.name) def __repr__(self): template = "{self.__class__.__name__}({self.name!r}, {self.id!r}, {self.provider!r}, {self.source_cls})" return template.format(self=self)
def split_tags(tokens: List[str]) -> List[List[str]]: '''Split a token array into discrete sets of tag tokens. Parameters ---------- tokens: list The characters of the tag token buffer Returns ------- list of list: The tokens for each contained tag ''' starts = [0] ends = [] for i, c in enumerate(tokens): if c == '|': ends.append(i) starts.append(i + 1) elif (i != 0 and c == '#'): ends.append(i) starts.append(i) ends.append(len(tokens)) out = [] for i, start in enumerate(starts): end = ends[i] tag = tokens[start:end] if len(tag) == 0: continue # Short circuit on INFO tags which can't be broken # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']): # tag = tokens[start:] # out.append(tag) # break out.append(tag) return out def find_prefix(tokens: List[str]) -> Tuple[str, str]: '''Find the prefix, if any of the tag defined by `tokens` delimited by ":". Parameters ---------- tokens: list The tag tokens to search Returns ------- prefix: str or None The prefix string, if found rest: str The rest of the tokens, merged as a string ''' for i, c in enumerate(tokens): if c == ':': return ''.join(tokens[:i]), ''.join(tokens[i + 1:]) return None, ''.join(tokens) def process_marker(tokens: Sequence[str]) -> Union[PositionLabelTag, LocalizationMarker]: '''Process a marker, which is a tag whose value starts with #. Parameters ---------- tokens: list or str The tag tokens to parse Returns ------- PositionLabelTag or LocalizationMarker ''' if tokens[1:3] == 'XL': return PositionLabelTag(None, group_id=''.join(tokens)) else: group_id = None value = None for i, c in enumerate(tokens): if c == '(': group_id = ''.join(tokens[:i]) if tokens[-1] != ')': raise Exception( "Localization marker with score missing closing parenthesis") value = float(''.join(tokens[i + 1:-1])) return LocalizationMarker(value, group_id=group_id) else: group_id = ''.join(tokens) return PositionLabelTag(group_id=group_id) def process_tag_tokens(tokens: List[str]) -> TagBase: '''Convert a tag token buffer into a parsed :class:`TagBase` instance of the appropriate sub-type with zero or more sub-tags. Parameters ---------- tokens: list The tokens to parse Returns ------- TagBase: The parsed tag ''' parts = split_tags(tokens) main_tag = parts[0] if main_tag[0] in ('+', '-'): main_tag = ''.join(main_tag) main_tag = MassModification(main_tag) elif main_tag[0] == '#': main_tag = process_marker(main_tag) else: prefix, value = find_prefix(main_tag) if prefix is None: value = ''.join(value) if value.lower() in TagBase.prefix_map: main_tag = TagBase.prefix_map[value.lower()]() else: main_tag = GenericModification(''.join(value)) else: try: tag_type = TagBase.find_by_tag(prefix) main_tag = tag_type(value) except KeyError: main_tag_str = ''.join(main_tag) main_tag = GenericModification(main_tag_str) if len(parts) > 1: extras = [] for part in parts[1:]: prefix, value = find_prefix(part) if prefix is None: if value[0] == "#": marker = process_marker(value) if isinstance(marker, PositionLabelTag): main_tag.group_id = ''.join(value) else: main_tag.group_id = marker.group_id extras.append(marker) else: value = ''.join(value) if value.lower() in TagBase.prefix_map: extra_tag = TagBase.prefix_map[value.lower()]() else: extra_tag = GenericModification("".join(value)) extras.append(extra_tag) else: try: tag_type = TagBase.find_by_tag(prefix) extra_tag = tag_type(value) except KeyError: part_str = ''.join(part) extra_tag = GenericModification(part_str) extras.append(extra_tag) main_tag.extra = extras return main_tag class ModificationTarget(object): aa: Optional[str] n_term: bool c_term: bool def __init__(self, aa, n_term=False, c_term=False): # Normalize amino acid to uppercase once for faster comparisons self.aa = aa.upper() if aa else None self.n_term = n_term self.c_term = c_term def __eq__(self, other): if isinstance(other, str): return str(self) == other else: return ( self.aa == other.aa and self.n_term == other.n_term and self.c_term == other.c_term ) def __ne__(self, other): if isinstance(other, str): return str(self) != other else: return ( self.aa != other.aa or self.n_term != other.n_term or self.c_term != other.c_term ) def __hash__(self): return hash(str(self)) def __str__(self): buffer = [] if self.n_term: buffer.append('N-term') if self.c_term: buffer.append('C-term') if self.aa: buffer.append(self.aa) return ':'.join(buffer) def __repr__(self): return str(self) def is_valid(self, aa: str, n_term: bool, c_term: bool) -> bool: """Check if this target matches the given amino acid and terminal status. Parameters ---------- aa : str The amino acid to check (should be pre-uppercased for efficiency) n_term : bool Whether this is an N-terminal position c_term : bool Whether this is a C-terminal position Returns ------- bool True if this target matches the given criteria """ if (n_term and self.n_term) or (c_term and self.c_term): if (self.aa and aa.upper() == self.aa) or self.aa is None: return True return False return self.aa == aa or self.aa is None @classmethod def from_str(cls, target: str): target_lower = target.lower() if target in VALID_AA: return cls(target, False, False) elif target_lower in ("n-term", "c-term"): n_term = target_lower == "n-term" c_term = target_lower == "c-term" return cls(None, n_term, c_term) elif target_lower.startswith(("n-term:", "c-term:")): tokens = target.split(":") if len(tokens) == 2: if tokens[1] in VALID_AA: t = tokens[0].lower() n_term = t == "n-term" c_term = t == "c-term" return cls(tokens[1], n_term, c_term) else: raise PyteomicsError( "Modification target has an invalid amino acid specific terminal target {1} in {0}".format( target, tokens[1] ) ) else: raise PyteomicsError( "Modification rule target {0} has an empty amino acid specific terminal target".format( target ) ) else: raise PyteomicsError( "Modification rule target {0} is invalid".format( target ) )
[docs] class ModificationRule(object): '''Define a fixed modification rule which dictates a modification tag is always applied at one or more amino acid residues. Attributes ---------- modification_tag: TagBase The modification to apply targets: list The list of amino acids this applies to ''' __slots__ = ('modification_tag', 'targets') modification_tag: TagBase targets: List[ModificationTarget]
[docs] def __init__(self, modification_tag: TagBase, targets: Union[ModificationTarget, List[ModificationTarget], List[str], None]=None): self.modification_tag = modification_tag self.targets = targets # type: ignore self._validate_targets()
[docs] def is_not_specific(self) -> bool: '''If there are no explicit targets, this rule might apply everywhere''' return not self.targets
def is_valid(self, aa: str, n_term: bool, c_term: bool) -> bool: for target in self.targets: if target.is_valid(aa, n_term, c_term): return True return False def _find_all(self, peptide: Iterable[str], n: int) -> Iterator[int]: ''' Tiny helper method to scan over a sequence with an external length and yield matched positions. ''' # decrement the length by 1 so that it matches the last position n -= 1 for i, aa in enumerate(peptide): if self.is_valid(aa, i == 0, i == n): yield i def _validate_targets(self): validated_targets = [] if self.targets is None: self.targets = [] elif not isinstance(self.targets, list): self.targets = [self.targets] for target in self.targets: if isinstance(target, ModificationTarget): validated_targets.append(target) else: try: validated_targets.append(ModificationTarget.from_str(target)) except PyteomicsError as err: raise PyteomicsError(f"While parsing {self}, encountered error {err}") from err self.targets = validated_targets def __eq__(self, other): if other is None: return False return self.modification_tag == other.modification_tag and self.targets == other.targets def __ne__(self, other): return not self == other def __str__(self): targets = ','.join(map(str, self.targets)) return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets) def __repr__(self): return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self)
[docs] class StableIsotope(object): ''' Define a fixed isotope that is applied globally to all amino acids. Attributes ---------- isotope: str The stable isotope string, of the form [<isotope-number>]<element> or a special isotopoform's name. ''' __slots__ = ('isotope', ) isotope: str
[docs] def __init__(self, isotope): self.isotope = isotope
def copy(self): return self.__class__(self.isotope) def __eq__(self, other): if other is None: return False return self.isotope == other.isotope def __ne__(self, other): return not self == other def __str__(self): return "<{self.isotope}>".format(self=self) def __repr__(self): return "{self.__class__.__name__}({self.isotope})".format(self=self)
class IntersectionEnum(Enum): no_overlap = 0 full_contains_interval = 1 full_contained_in_interval = 2 start_overlap = 3 end_overlap = 4
[docs] class TaggedInterval(object): '''Define a fixed interval over the associated sequence which contains the localization of the associated tag or denotes a region of general sequence order ambiguity. Attributes ---------- start: int The starting position (inclusive) of the interval along the primary sequence end: int The ending position (exclusive) of the interval along the primary sequence tags: list[TagBase] The tags being localized ambiguous : bool Whether the interval is ambiguous or not ''' __slots__ = ('start', 'end', 'tags', 'ambiguous') start: int end: Optional[int] tags: Optional[List[TagBase]] ambiguous: bool
[docs] def __init__(self, start, end=None, tags=None, ambiguous=False): self.start = start self.end = end self.tags = tags self.ambiguous = ambiguous
def copy(self): return self.__class__( self.start, self.end, [v.copy() for v in self.tags] if self.tags else [], self.ambiguous ) def __eq__(self, other): if other is None: return False return self.start == other.start and self.end == other.end and self.tags == other.tags def __hash__(self): return hash((self.start, self.end, tuple(self.tags or []), self.ambiguous)) def __ne__(self, other): return not self == other def __str__(self): return f"({'?' if self.ambiguous else ''}{self.start}-{self.end}){self.tags!r}" def __repr__(self): return f"{self.__class__.__name__}({self.start}, {self.end}, {self.tags}, ambiguous={self.ambiguous})" def as_slice(self): return slice(self.start, self.end) def contains(self, i): return self.start <= i < self.end def __contains__(self, i): return self.contains(i) def _check_slice(self, qstart, qend, warn_ambiguous): # Fully contained interval valid = qstart <= self.start and qend >= self.end case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap if not valid: # Spans the beginning but not the end valid = qstart <= self.start and qend > self.start if valid: case = IntersectionEnum.start_overlap if warn_ambiguous: warnings.warn("Slice bisecting interval %s" % (self, )) if not valid: # Spans the end but not the beginning valid = qstart < self.end and qend > self.end if valid: case = IntersectionEnum.end_overlap if warn_ambiguous: warnings.warn("Slice bisecting interval %s" % (self, )) if not valid: # Contained interval valid = qstart >= self.start and qend < self.end if valid: case = IntersectionEnum.full_contains_interval if warn_ambiguous: warnings.warn("Slice bisecting interval %s" % (self, )) return valid, case def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): if end is None: qend = self.end + 1 else: qend = end if start is None: qstart = self.start - 1 else: qstart = start valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous) if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap): raise ValueError("Cannot bisect an ambiguous interval") if not valid: return None new = self.copy() if start is not None: diff = self.start - start if diff < 0: diff = 0 new.start = diff if end is not None: width = min(new.end, end) - self.start else: width = self.end - max(start, self.start) new.end = new.start + width return new
class Adduct(NamedTuple): name: str charge: int count: int def __str__(self): base = f"{self.name}:z{'+' if self.charge > 0 else ''}{self.charge}" if self.count > 1: return base + f"^{self.count}" return base def composition(self) -> Composition: if self.name == 'e-': return Composition({"e-": self.count}) comp = FormulaModification( f"{self.name}:z{'+' if self.charge > 0 else ''}{self.charge}" ).composition * self.count return comp def mass(self) -> float: return self.composition().mass() def total_charge(self) -> int: return self.charge * self.count
[docs] class ChargeState: """Describes the charge and adduct types of the structure. This type *MAY* be coerced to an :class:`int`, in which case it decays to it's :attr:`charge` attribute, an integer. Common methods like :func:`abs` and arithmetic operators will work on this value. Attributes ---------- charge : int The total charge state as a signed number. adducts : list[Adduct] Each charge carrier associated with the molecule. """ __slots__ = ("charge", "adducts") charge: int adducts: List[Adduct] @classmethod def from_adducts(cls, adducts: List[Adduct]): acc = 0 for a in adducts: acc += a.charge * a.count return cls(acc, adducts)
[docs] def __init__(self, charge: int, adducts=None): if adducts is None: if charge > 0: adducts = [Adduct("H", 1, charge)] elif charge < 0: adducts = [Adduct("e-", -1, charge)] else: adducts = [] self.charge = int(charge) self.adducts = adducts
[docs] def is_complete(self) -> bool: """ Test if the total charge recorded here is completely explained by the recorded adducts. If not, then there is a localized charge somewhere on the source proteoform. Returns ------- bool """ return self.charge == sum(a.charge * a.count for a in self.adducts)
def __int__(self) -> int: """ Get the total charge as number. This is equivalent to accessing :attr:`charge` .. note:: This is technically a lossy operation as it discards the distinction between the different charge carriers. """ return self.charge def __float__(self): '''See :meth:`__int__`''' return float(int(self)) def __neg__(self): return -int(self) def __pos__(self): return +int(self) def __abs__(self): """ Return the absolute magnitude of the charge state See Also -------- :meth:`__int__` """ return abs(self.charge) def __mul__(self, other): return int(self) * other def __add__(self, other): return int(self) + other def __sub__(self, other): return int(self) - other def __div__(self, other): return int(self) / other def __rmul__(self, other): return other * int(self) def __radd__(self, other): return other + int(self) def __rsub__(self, other): return other - int(self) def __rdiv__(self, other): return other / int(self) def __eq__(self, other): if not isinstance(other, ChargeState): other = ChargeState(other) return self.charge == other.charge and (self.adducts == other.adducts) def __ne__(self, other): return not self == other
[docs] def for_mz_calculation(self) -> Tuple[float, int]: """ Get the total mass of the charge carrier(s) and their collective charge to plug into the formula for mass-to-charge-ratio, ``(mass of molecule + mass of charge carrier) / charge`` Returns ------- charge_carrier_mass : float The total mass of the charge carriers(s) in the adducting group(s) charge : int The total charge contributed by all the charge carriers in the adducting group(s) """ mass = 0.0 for a in self.adducts: mass += a.mass() return (mass, self.charge)
def composition(self): comp = Composition() for a in self.adducts: comp += a.composition() return comp def format_local(self, local_charge_to_remove: int=0): if self.adducts and (len(self.adducts) > 1 or self.adducts[0].name != "H"): tokens = [] tokens.append("[") tokens.append(",".join((map(str, self.adducts)))) tokens.append("]") return "".join(tokens) else: return f"{self.charge - local_charge_to_remove:d}" def __str__(self): return self.format_local() def __repr__(self): template = "{self.__class__.__name__}({self.charge}, {self.adducts})" return template.format(self=self)
class TokenBuffer(Generic[T]): '''A token buffer that wraps the accumulation and reset logic of a list of :class:`str` objects. Implements a subset of the Sequence protocol. Attributes ---------- buffer: list The list of tokens accumulated since the last parsing. ''' buffer: List[str] boundaries: List[int] def __init__(self, initial=None): self.buffer = list(initial or []) self.boundaries = [] def append(self, c: str): ''' Append a new character to the buffer. Parameters ---------- c: str The character appended ''' self.buffer.append(c) def extend(self, cs: str): ''' Extend the buffer with additional characters Parameters ---------- cs: str The chracters to append ''' self.buffer.extend(cs) def reset(self): '''Discard the content of the current buffer. ''' if self.buffer: self.buffer = [] if self.boundaries: self.boundaries = [] def __bool__(self): return bool(self.buffer) def __iter__(self): return iter(self.buffer) def __getitem__(self, i): return self.buffer[i] def __len__(self): return len(self.buffer) def tokenize(self) -> List[str]: i = 0 pieces = [] for k in self.boundaries + [len(self)]: piece = self.buffer[i:k] i = k pieces.append(piece) return pieces def _transform(self, value: T) -> T: return value def process(self) -> Union[T, List[T]]: if self.boundaries: value = [self._transform(v) for v in self.tokenize()] else: value = self._transform(self.buffer) self.reset() return value def bound(self) -> int: k = len(self) self.boundaries.append(k) return k def __call__(self) -> Union[T, List[T]]: return self.process() class NumberParser(TokenBuffer[int]): '''A buffer which accumulates tokens until it is asked to parse them into :class:`int` instances. ''' def _transform(self, value) -> int: return int(''.join(value)) class StringParser(TokenBuffer[str]): '''A buffer which accumulates tokens until it is asked to parse them into :class:`str` instances. ''' def _transform(self, value) -> str: return ''.join(value) class AdductParser(StringParser): '''A buffer which accumulates tokens related to adducts until it is asked to parse them into a list of [(str, int)] tuples, where the first element is the adduct name and the second element is the number of adducts of that type. ''' token_pattern = re.compile(r'(?P<number>[+-]?\d*)(?P<adduct>[A-Za-z]+)(?P<charge>\d*[+-])') token_pattern2 = re.compile( r"(?P<adduct>[0-9A-Za-z\[\]]+):[zZ](?P<charge>(-|\+)\d+)(?:\^(?P<number>\d+))?" ) def parse_form1(self, token: str) -> Optional[Tuple[str, int, int]]: parsed = self.token_pattern.match(token) if not parsed: return None gdict = parsed.groupdict() if gdict['adduct'] == 'e': adduct = 'e-' else: adduct = gdict['adduct'] if gdict['number'] == '+' or gdict['number'] == '': number = 1 elif gdict['number'] == '-': number = -1 else: number = int(gdict['number']) charge = int(gdict['charge'][:-1]) if gdict['charge'][:-1] else 1 if gdict['charge'][-1] == '-': charge = -charge return (adduct, charge, number) def parse_form2(self, token: str): parsed = self.token_pattern2.match(token) if not parsed: return None gdict = parsed.groupdict() if gdict['adduct'] == 'e': adduct = 'e-' else: adduct = gdict['adduct'] if gdict['number'] == '+' or gdict['number'] == '': number = 1 elif gdict['number'] == '-': number = -1 elif gdict['number'] is not None: number = int(gdict['number']) else: number = 1 charge = int(gdict['charge']) if gdict['charge'] else 1 return (adduct, charge, number) def process(self): value = [] for token in self.tokenize(): if not isinstance(token, str): token = ''.join(token) try: adduct = self.parse_form1(token) if not adduct: adduct = self.parse_form2(token) if not adduct: raise ProFormaError( "Invalid adduct token {!r} in {!r}".format(token, self.buffer) ) value.append(Adduct(*adduct)) except AttributeError: raise ProFormaError("Invalid adduct token {!r} in {!r}".format(token, self.buffer)) return value class TagParser(TokenBuffer[TagBase]): '''A buffer which accumulates tokens until it is asked to parse them into :class:`TagBase` instances. Implements a subset of the Sequence protocol. Attributes ---------- buffer: list The list of tokens accumulated since the last parsing. group_ids: set The set of all group IDs that have been produced so far. ''' def __init__(self, initial=None, group_ids=None): super(TagParser, self).__init__(initial) if group_ids: self.group_ids = set(group_ids) else: self.group_ids = set() def _transform(self, value): tag = process_tag_tokens(value) if tag.group_id: self.group_ids.add(tag.group_id) return tag def process(self): value = super(TagParser, self).process() if not isinstance(value, list): value = [value] return value class ParserStateEnum(Enum): before_sequence = 0 tag_before_sequence = 1 global_tag = 2 fixed_spec = 3 labile_tag = 4 sequence = 5 tag_in_sequence = 6 interval_tag = 7 tag_after_sequence = 8 stable_isotope = 9 post_tag_before = 10 unlocalized_count = 11 post_global = 12 post_global_aa = 13 post_interval_tag = 14 post_tag_after = 15 charge_state_start = 16 charge_state_number = 17 charge_state_adduct_start = 18 charge_state_adduct_end = 19 inter_chain_cross_link_start = 20 chimeric_start = 21 interval_initial = 22 post_global_terminal = 23 peptidoform_name_start = 24 peptidoform_name_level = 25 peptidoform_name_text = 26 peptidoform_name_close = 27 done = 999 BEFORE = ParserStateEnum.before_sequence TAG_BEFORE = ParserStateEnum.tag_before_sequence FIXED = ParserStateEnum.fixed_spec GLOBAL = ParserStateEnum.global_tag ISOTOPE = ParserStateEnum.stable_isotope LABILE = ParserStateEnum.labile_tag SEQ = ParserStateEnum.sequence TAG = ParserStateEnum.tag_in_sequence INTERVAL_TAG = ParserStateEnum.interval_tag INTERVAL_INIT = ParserStateEnum.interval_initial TAG_AFTER = ParserStateEnum.tag_after_sequence POST_TAG_BEFORE = ParserStateEnum.post_tag_before POST_TAG_AFTER = ParserStateEnum.post_tag_after UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count POST_GLOBAL = ParserStateEnum.post_global POST_GLOBAL_AA = ParserStateEnum.post_global_aa POST_GLOBAL_TERM = ParserStateEnum.post_global_terminal POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag CHARGE_START = ParserStateEnum.charge_state_start CHARGE_NUMBER = ParserStateEnum.charge_state_number ADDUCT_START = ParserStateEnum.charge_state_adduct_start ADDUCT_END = ParserStateEnum.charge_state_adduct_end PEPTIDOFORM_NAME_START = ParserStateEnum.peptidoform_name_start PEPTIDOFORM_NAME_LEVEL = ParserStateEnum.peptidoform_name_level PEPTIDOFORM_NAME_TEXT = ParserStateEnum.peptidoform_name_text PEPTIDOFORM_NAME_CLOSE = ParserStateEnum.peptidoform_name_close DONE = ParserStateEnum.done VALID_AA_UPPER = set("QWERTYIPASDFGHKLCVNMXUOJZB") VALID_AA = {s.lower() for s in VALID_AA_UPPER} | VALID_AA_UPPER TERMINAL_SPEC_CHARS = set('N-term') | set('C-term') | set("ncT: ") def _local_charges( position_list, intervals: List[TaggedInterval], unlocalized_modifications: List[TagBase], labile_modifications: List[TagBase], fixed_modifications: List[TagBase] ) -> Tuple[int, int]: """ Count the number of localized charges that the parsed ProForma sequence has. This specifically counts modifications with a registered charge state like charged :class:`FormulaModification` instances. Returns ------- local_charges : int The total charge state attributable to localized modifications n_charged_modifications : int The number of charged modifications on the sequence """ local_charges = 0 n_charged_modifications = 0 for _, tags in position_list: for tag in tags or (): # tags may be None z_of = getattr(tag, "charge", 0) if z_of: n_charged_modifications += 1 local_charges += z_of for iv in intervals: for tag in iv.tags or (): z_of = getattr(tag, "charge", 0) if z_of: n_charged_modifications += 1 local_charges += z_of for tag in unlocalized_modifications: z_of = getattr(tag, "charge", 0) if z_of: n_charged_modifications += 1 local_charges += z_of for tag in labile_modifications: z_of = getattr(tag, "charge", 0) if z_of: n_charged_modifications += 1 local_charges += z_of for fixed_mod in fixed_modifications: z_of = getattr(fixed_mod.modification_tag, "charge", 0) if z_of: for _ in fixed_mod._find_all((aa for aa, _ in position_list), len(position_list)): local_charges += z_of n_charged_modifications += 1 return local_charges, n_charged_modifications class Parser: """ A parser for the ProForma 2 syntax. Attributes ---------- sequence : str The sequence to be parsed index : int The current index parsing from depth : int The current depth of the brace type being parsed within length : int The total length in characters of the sequence to parse state : ParserStateEnum The state of the parser is currently in, dictating how it will interpret the next token read. """ sequence: str index: int depth: int length: int state: ParserStateEnum labile_modifications: List[TagBase] fixed_modifications: List[ModificationRule] unlocalized_modifications: List[TagBase] intervals: List[TaggedInterval] isotopes: List[StableIsotope] n_term: List[TagBase] c_term: List[TagBase] positions: List current_aa: str current_interval: Optional[TaggedInterval] current_tag: TagParser current_unlocalized_count: NumberParser current_aa_targets: StringParser charge_buffer: Optional[NumberParser] adduct_buffer: Optional[AdductParser] def __init__(self, sequence: str, case_sensitive_aa: bool=False): """ Instantiate a ProForma 2 parser for the specified sequence. Parameters ---------- sequence : str The sequence to parse case_sensitive_aa : bool Whether to treat amino acids as case sensitive (older behavior) while the specification states they should be handled insensitively. """ self.sequence = sequence self.index = 0 self.depth = 0 self.length = len(sequence) self.state = ParserStateEnum.before_sequence self._VALID_AA = VALID_AA if not case_sensitive_aa else VALID_AA_UPPER self.n_term = [] self.c_term = [] self.intervals = [] self.positions = [] self.adduct_buffer = None self.charge_buffer = None self.current_aa = None self.current_interval = None self.current_tag = TagParser() self.current_aa_targets = StringParser() self.current_unlocalized_count = NumberParser() self.unlocalized_modifications = [] self.labile_modifications = [] self.fixed_modifications = [] self.unlocalized_modifications = [] self.intervals = [] self.isotopes = [] self.name_level = None self.name_buffer = StringParser() self.names = {} @property def i(self) -> int: return self.index @i.setter def i(self, value: int): self.index = value @property def n(self) -> int: return self.length def pack_sequence_position(self): self.positions.append( ( self.current_aa, self.current_tag() if self.current_tag else None, ) ) self.current_aa = None def handle_before(self, c: str): if c == '[': self.state = TAG_BEFORE self.depth = 1 elif c == '{': self.state = LABILE self.depth = 1 elif c == '<': self.state = FIXED elif c in self._VALID_AA: self.current_aa = c self.state = SEQ elif c == '(': if (self.index + 1) < self.length: if self.sequence[self.index + 1] == '>': self.index += 1 self.state = PEPTIDOFORM_NAME_LEVEL self.depth = 1 self.name_level = 1 else: self.state = INTERVAL_INIT self.current_interval = TaggedInterval(len(self.positions) + 1) else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) def handle_seq(self, c: str): state = self.state if state == INTERVAL_INIT: self.state = SEQ if c == '?': if self.current_interval is not None: self.current_interval.ambiguous = True # continue return True if c in self._VALID_AA: if self.current_aa is not None: self.pack_sequence_position() self.current_aa = c elif c == '[': self.state = TAG if self.current_tag: self.current_tag.bound() self.depth = 1 elif c == '(': if self.current_interval is not None: raise ProFormaError( ( f"Error In State {self.state}, nested range found at index {self.index}. " "Nested ranges are not yet supported by ProForma." ), self.index, self.state, ) self.current_interval = TaggedInterval(len(self.positions) + 1) self.state = INTERVAL_INIT elif c == ')': self.pack_sequence_position() if self.current_interval is None: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.index}", self.index, self.state, ) else: self.current_interval.end = len(self.positions) if self.i + 1 < self.n and self.sequence[self.i + 1] == "[": self.i += 1 self.depth = 1 self.state = INTERVAL_TAG else: self.intervals.append(self.current_interval) self.current_interval = None elif c == '-': if self.current_aa: self.pack_sequence_position() self.state = TAG_AFTER if self.i >= self.n or self.sequence[self.i + 1] != "[": raise ProFormaError("Missing Opening Tag", self.i, self.state) self.i += 1 self.depth = 1 elif c == '/': self.state = CHARGE_START self.charge_buffer = NumberParser() self.adduct_buffer = AdductParser() elif c == '+': raise ProFormaError( f"Error In State {self.state}, {c} found at index {self.i}. Chimeric representation not supported", self.i, self.state, ) else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) def handle_tag(self, c: str): if c == "[": self.depth += 1 self.current_tag.append(c) elif c == "]": self.depth -= 1 if self.depth <= 0: self.depth = 0 if self.state == TAG: self.state = SEQ elif self.state == TAG_BEFORE: self.state = POST_TAG_BEFORE elif self.state == TAG_AFTER: self.c_term = self.current_tag() self.state = POST_TAG_AFTER elif self.state == GLOBAL: self.state = POST_GLOBAL elif self.state == INTERVAL_TAG: self.state = POST_INTERVAL_TAG # self.current_interval.tags.append(self.current_tag()) self.depth = 0 else: self.current_tag.append(c) else: self.current_tag.append(c) def handle_fixed(self, c: str): if c == '[': self.state = GLOBAL else: # Do validation here self.state = ISOTOPE self.current_tag.reset() self.current_tag.append(c) def handle_isotope(self, c: str): if c != ">": self.current_tag.append(c) else: # Not technically a tag, but exploits the current buffer self.isotopes.append(StableIsotope("".join(self.current_tag))) self.current_tag.reset() self.state = BEFORE def handle_labile(self, c: str): if c == "{": self.depth += 1 elif c == "}": self.depth -= 1 if self.depth <= 0: self.depth = 0 self.labile_modifications.append(self.current_tag()[0]) self.state = BEFORE else: self.current_tag.append(c) def handle_post_interval_tag(self, c: str): if c == "[": self.current_tag.bound() self.state = INTERVAL_TAG elif c in self._VALID_AA: self.current_aa = c self.current_interval.tags = self.current_tag() self.intervals.append(self.current_interval) self.current_interval = None self.state = SEQ elif c == "-": self.state = TAG_AFTER # Unroll next state to immediately fall into a tag parsing state instead of # including a separate post-dash state if self.i >= self.n or self.sequence[self.i] != "[": raise ProFormaError("Missing Closing Tag", self.i, self.state) self.i += 1 self.depth = 1 elif c == "/": self.state = CHARGE_START self.charge_buffer = NumberParser() elif c == "+": raise ProFormaError( f"Error In State {self.state}, {self.c} found at index {self.i}. Chimeric representation not supported", self.i, self.state, ) else: raise ProFormaError( f"Error In State {self.state}, unexpected {self.c} found at index {self.i}", self.i, self.state, ) def handle_post_tag_before(self, c: str): if c == "?": self.unlocalized_modifications.extend(self.current_tag()) self.state = BEFORE elif c == "-": self.n_term = self.current_tag() self.state = BEFORE elif c == "^": self.state = UNLOCALIZED_COUNT elif c == "[": self.current_tag.bound() self.state = TAG_BEFORE else: raise ProFormaError( f"Error In State {self.state}, unexpected {self.c} found at index {self.i}", self.i, self.state, ) def handle_unlocalized_count(self, c: str): if c.isdigit(): self.current_unlocalized_count.append(c) elif c == "[": self.state = TAG_BEFORE self.depth = 1 tags = self.current_tag() tags, tag = tags[:-1], tags[-1] self.unlocalized_modifications.extend(tags) multiplicity = self.current_unlocalized_count() for _ in range(multiplicity): self.unlocalized_modifications.append(tag) elif c == "?": self.state = BEFORE tags = self.current_tag() tags, tag = tags[:-1], tags[-1] self.unlocalized_modifications.extend(tags) multiplicity = self.current_unlocalized_count() for _ in range(multiplicity): self.unlocalized_modifications.append(tag) else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) def handle_post_global(self, c: str): if c == "@": self.state = POST_GLOBAL_AA else: raise ProFormaError( ( f"Error In State {self.state}, fixed modification detected without " f"target amino acids found at index {self.i}" ), self.i, self.state, ) def handle_post_global_aa(self, c: str): if c in self._VALID_AA or c in TERMINAL_SPEC_CHARS: self.current_aa_targets.append(c) elif c == ",": # the next character should be another amino acid self.current_aa_targets.bound() elif c == ">": try: v = self.current_aa_targets() self.fixed_modifications.append( ModificationRule(self.current_tag()[0], v) ) except PyteomicsError as err: raise ProFormaError( ( f"Error In State {self.state}, fixed modification detected invalid " f"target found at index {self.i}: {err}" ), self.i, self.state, ) self.state = BEFORE else: raise ProFormaError( f"Error In State {self.state}, unclosed fixed modification rule", self.i, self.state, ) def handle_post_tag_after(self, c: str): if c == "/": self.state = CHARGE_START self.charge_buffer = NumberParser() elif c == "+": raise ProFormaError( f"Error In State {self.state}, {c} found at index {self.i}. Chimeric representation not supported", self.i, self.state, ) def handle_charge_start(self, c: str): if c in "+-": self.charge_buffer.append(c) self.state = CHARGE_NUMBER elif c.isdigit(): self.charge_buffer.append(c) self.state = CHARGE_NUMBER elif c == "/": self.state = ParserStateEnum.inter_chain_cross_link_start raise ProFormaError( "Inter-chain cross-linked peptides are not yet supported", self.i, self.state, ) elif c == '[': self.state = ParserStateEnum.charge_state_adduct_start else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) def handle_charge_number(self, c: str): if c.isdigit(): self.charge_buffer.append(c) elif c == "[": self.state = ADDUCT_START self.depth = 1 self.adduct_buffer = AdductParser() else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) def handle_adduct_start(self, c: str): if c.isdigit() or c in "^+:-" or c.isalpha(): self.adduct_buffer.append(c) elif c == "[": self.depth += 1 self.adduct_buffer.append(c) elif c == ",": self.adduct_buffer.bound() elif c == "]": self.depth -= 1 if self.depth == 0: self.state = ADDUCT_END else: self.adduct_buffer.append(c) def handle_adduct_end(self, c: str): if c == "+": raise ProFormaError( f"Error In State {self.state}, {c} found at index {self.i}. Chimeric representation not supported", self.i, self.state, ) def handle_name_level(self, c: str): if c == '>' and self.name_level < 3: self.name_level += 1 elif c == ')': self.names[self.name_level] = self.name_buffer() self.name_level = 0 self.state = BEFORE self.depth = 0 else: self.name_buffer.append(c) self.state = PEPTIDOFORM_NAME_TEXT def handle_name_text(self, c: str): if c == ')': self.depth -= 1 if self.depth <= 0: name = self.name_buffer() self.names[self.name_level] = name self.state = BEFORE else: self.name_buffer.append(c) elif c == '(': self.depth += 1 self.name_buffer.append(c) else: self.name_buffer.append(c) def step(self) -> bool: if self.index < self.length: c = self.sequence[self.index] # Initial state prior to sequence content if self.state == BEFORE: self.handle_before(c) # The body of the amino acid sequence. elif self.state == SEQ or self.state == INTERVAL_INIT: self.handle_seq(c) # Tag parsing which rely on `current_tag` to buffer tokens. elif ( self.state == TAG or self.state == TAG_BEFORE or self.state == TAG_AFTER or self.state == GLOBAL or self.state == INTERVAL_TAG ): self.handle_tag(c) # Handle transition to fixed modifications or isotope labeling from opening signal. elif self.state == FIXED: self.handle_fixed(c) # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens elif self.state == ISOTOPE: self.handle_isotope(c) # Handle labile modifications, which rely on `current_tag` to buffer tokens elif self.state == LABILE: self.handle_labile(c) # The intermediate state between an interval tag and returning to sequence parsing. # A new tag may start immediately, leading to it being appended to the interval instead # instead of returning to the primary sequence. Because this state may also occur at the # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags, # charge states, and the like. elif self.state == POST_INTERVAL_TAG: self.handle_post_interval_tag(c) # An intermediate state for discriminating which type of tag-before-sequence type # we just finished parsing. elif self.state == POST_TAG_BEFORE: self.handle_post_tag_before(c) elif self.state == UNLOCALIZED_COUNT: self.handle_unlocalized_count(c) elif self.state == POST_GLOBAL: self.handle_post_global(c) elif self.state == POST_GLOBAL_AA: self.handle_post_global_aa(c) elif self.state == POST_TAG_AFTER: self.handle_post_tag_after(c) elif self.state == CHARGE_START: self.handle_charge_start(c) elif self.state == CHARGE_NUMBER: self.handle_charge_number(c) elif self.state == ADDUCT_START: self.handle_adduct_start(c) elif self.state == ADDUCT_END: self.handle_adduct_end(c) elif self.state == PEPTIDOFORM_NAME_LEVEL: self.handle_name_level(c) elif self.state == PEPTIDOFORM_NAME_TEXT: self.handle_name_text(c) else: raise ProFormaError( f"Error In State {self.state}, unexpected {c} found at index {self.i}", self.i, self.state, ) self.index += 1 return self.index < self.length def finish( self, ) -> Tuple[List[Tuple[str, Optional[List[TagBase]]]], Dict[str, Any]]: """ Post-process the parser's accumulated parsed token data and return the parsed sequence and metadata. Returns ------- sequence : List[Tuple[str, Optional[List[TagBase]]]] The primary amino acid sequence of the ProForma string metadata : Dict[str, Any] All other information outside the main sequence, including unlocalized, labile, or global modifications, names, charge states, and more. """ if self.charge_buffer: charge_number = self.charge_buffer() if self.adduct_buffer: adducts = self.adduct_buffer() else: adducts = None charge_state = ChargeState(charge_number, adducts) elif self.adduct_buffer: adducts = self.adduct_buffer() charge_state = ChargeState.from_adducts(adducts) else: charge_state = None if self.current_aa: self.pack_sequence_position() z, k = self._local_charges() if k: if charge_state is None: charge_state = ChargeState(0) charge_state.charge += z if self.state in ( ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): raise ProFormaError( f"Error In State {self.state}, unclosed group reached end of string!", self.i, self.state, ) return self.positions, { "n_term": self.n_term, "c_term": self.c_term, "unlocalized_modifications": self.unlocalized_modifications, "labile_modifications": self.labile_modifications, "fixed_modifications": self.fixed_modifications, "intervals": self.intervals, "isotopes": self.isotopes, "group_ids": sorted(self.current_tag.group_ids), "charge_state": charge_state, "names": self.names } def _local_charges(self) -> Tuple[int, int]: return _local_charges( self.positions, self.intervals, self.unlocalized_modifications, self.labile_modifications, self.fixed_modifications ) def parse(self): while self.step(): pass return self.finish() def __call__(self, *args, **kwds): return self.parse() @staticmethod def empty_properties(): return { 'n_term': [], 'c_term': [], 'unlocalized_modifications': [], 'labile_modifications': [], 'fixed_modifications': [], 'intervals': [], 'isotopes': [], 'group_ids': [], 'charge_state': None, 'names': {} }
[docs] def parse(sequence: str, **kwargs) -> Tuple[List[Tuple[str, Optional[List[TagBase]]]], Dict[str, Any]]: """ Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a mapping of sequence-spanning modifiers. .. note:: This is a state machine parser, but with certain sub-state paths unrolled to avoid an explosion of formal intermediary states. Parameters ---------- sequence: str The sequence to parse **kwargs : Forwarded to :class:`Parser` Returns ------- parsed_sequence: list[tuple[str, list[TagBase]]] The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence modifiers: dict A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized modifications, tagged intervals, and group IDs """ # short-circuiting the parser for simple sequences with no tags or modifications to avoid overhead if sequence.isupper() and sequence.isalpha(): return ( [(aa, None) for aa in sequence], Parser.empty_properties() ) parser = Parser(sequence, **kwargs) return parser.parse()
def _parse(sequence): '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a mapping of sequence-spanning modifiers. .. warning:: This is the older parser which was designed for ProForma v2.0. There are some syntactic constructs that were introduced in v2.1 that are not compatible. This function is retained for handling sequences in the older format only. .. note:: This is a state machine parser, but with certain sub-state paths unrolled to avoid an explosion of formal intermediary states. Parameters ---------- sequence: str The sequence to parse Returns ------- parsed_sequence: list[tuple[str, list[TagBase]]] The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence modifiers: dict A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized modifications, tagged intervals, and group IDs ''' labile_modifications = [] fixed_modifications = [] unlocalized_modifications = [] intervals = [] isotopes = [] n_term = None c_term = None i = 0 n = len(sequence) positions = [] state = BEFORE depth = 0 current_aa = None current_tag = TagParser() current_interval = None current_unlocalized_count = NumberParser() current_aa_targets = StringParser() charge_buffer = None adduct_buffer = None # A mostly context free finite state machine unrolled # by hand. while i < n: c = sequence[i] i += 1 # Initial state prior to sequence content if state == BEFORE: if c == '[': state = TAG_BEFORE depth = 1 elif c == '{': state = LABILE depth = 1 elif c == '<': state = FIXED elif c in VALID_AA: current_aa = c state = SEQ else: raise ProFormaError( f"Error In State {state}, unexpected {c} found at index {i}", i, state) # The body of the amino acid sequence. elif state == SEQ or state == INTERVAL_INIT: if state == INTERVAL_INIT: state = SEQ if c == '?': if current_interval is not None: current_interval.ambiguous = True continue if c in VALID_AA: if current_aa is not None: positions.append((current_aa, current_tag() if current_tag else None)) current_aa = c elif c == '[': state = TAG if current_tag: current_tag.bound() depth = 1 elif c == '(': if current_interval is not None: raise ProFormaError( ("Error In State {state}, nested range found at index {i}. " "Nested ranges are not yet supported by ProForma.").format( **locals()), i, state) current_interval = TaggedInterval(len(positions) + 1) state = INTERVAL_INIT elif c == ')': positions.append( (current_aa, current_tag() if current_tag else None)) current_aa = None if current_interval is None: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) else: current_interval.end = len(positions) if i < n and sequence[i] == '[': i += 1 depth = 1 state = INTERVAL_TAG else: intervals.append(current_interval) current_interval = None elif c == '-': if current_aa: positions.append((current_aa, current_tag() if current_tag else None)) current_aa = None state = TAG_AFTER if i >= n or sequence[i] != '[': raise ProFormaError("Missing Closing Tag", i, state) i += 1 depth = 1 elif c == '/': state = CHARGE_START charge_buffer = NumberParser() elif c == '+': raise ProFormaError( "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) # Tag parsing which rely on `current_tag` to buffer tokens. elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG: if c == '[': depth += 1 current_tag.append(c) elif c == ']': depth -= 1 if depth <= 0: depth = 0 if state == TAG: state = SEQ elif state == TAG_BEFORE: state = POST_TAG_BEFORE elif state == TAG_AFTER: c_term = current_tag() state = POST_TAG_AFTER elif state == GLOBAL: state = POST_GLOBAL elif state == INTERVAL_TAG: state = POST_INTERVAL_TAG depth = 0 else: current_tag.append(c) else: current_tag.append(c) # Handle transition to fixed modifications or isotope labeling from opening signal. elif state == FIXED: if c == '[': state = GLOBAL else: # Do validation here state = ISOTOPE current_tag.reset() current_tag.append(c) # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens elif state == ISOTOPE: if c != '>': current_tag.append(c) else: # Not technically a tag, but exploits the current buffer isotopes.append(StableIsotope(''.join(current_tag))) current_tag.reset() state = BEFORE # Handle labile modifications, which rely on `current_tag` to buffer tokens elif state == LABILE: if c == '{': depth += 1 elif c == '}': depth -= 1 if depth <= 0: depth = 0 labile_modifications.append(current_tag()[0]) state = BEFORE else: current_tag.append(c) # The intermediate state between an interval tag and returning to sequence parsing. # A new tag may start immediately, leading to it being appended to the interval instead # instead of returning to the primary sequence. Because this state may also occur at the # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags, # charge states, and the like. elif state == POST_INTERVAL_TAG: if c == '[': current_tag.bound() state = INTERVAL_TAG elif c in VALID_AA: current_aa = c current_interval.tags = current_tag() intervals.append(current_interval) current_interval = None state = SEQ elif c == '-': state = TAG_AFTER if i >= n or sequence[i] != '[': raise ProFormaError("Missing Closing Tag", i, state) i += 1 depth = 1 elif c == '/': state = CHARGE_START charge_buffer = NumberParser() elif c == '+': raise ProFormaError( "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) # An intermediate state for discriminating which type of tag-before-sequence type # we just finished parsing. elif state == POST_TAG_BEFORE: if c == '?': unlocalized_modifications.append(current_tag()[0]) state = BEFORE elif c == '-': n_term = current_tag() state = BEFORE elif c == '^': state = UNLOCALIZED_COUNT else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == UNLOCALIZED_COUNT: if c.isdigit(): current_unlocalized_count.append(c) elif c == '[': state = TAG_BEFORE depth = 1 tag = current_tag()[0] multiplicity = current_unlocalized_count() for _ in range(multiplicity): unlocalized_modifications.append(tag) elif c == '?': state = BEFORE tag = current_tag()[0] multiplicity = current_unlocalized_count() for _ in range(multiplicity): unlocalized_modifications.append(tag) else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == POST_GLOBAL: if c == '@': state = POST_GLOBAL_AA else: raise ProFormaError( ("Error In State {state}, fixed modification detected without " "target amino acids found at index {i}").format(**locals()), i, state) elif state == POST_GLOBAL_AA: if c in VALID_AA or c in TERMINAL_SPEC_CHARS: current_aa_targets.append(c) elif c == ',': # the next character should be another amino acid current_aa_targets.bound() elif c == '>': try: v = current_aa_targets() fixed_modifications.append( ModificationRule(current_tag()[0], v)) except PyteomicsError as err: raise ProFormaError( ( "Error In State {state}, fixed modification detected invalid " "target found at index {i}: {err}" ).format(state=state, i=i, err=err), i, state, ) state = BEFORE else: raise ProFormaError( ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state) elif state == POST_TAG_AFTER: if c == '/': state = CHARGE_START charge_buffer = NumberParser() elif c == '+': raise ProFormaError( "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) elif state == CHARGE_START: if c in '+-': charge_buffer.append(c) state = CHARGE_NUMBER elif c.isdigit(): charge_buffer.append(c) state = CHARGE_NUMBER elif c == '/': state = ParserStateEnum.inter_chain_cross_link_start raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state) else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == CHARGE_NUMBER: if c.isdigit(): charge_buffer.append(c) elif c == "[": state = ADDUCT_START adduct_buffer = AdductParser() else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == ADDUCT_START: if c.isdigit() or c in "+-" or c.isalpha(): adduct_buffer.append(c) elif c == ',': adduct_buffer.bound() elif c == ']': state = ADDUCT_END elif state == ADDUCT_END: if c == '+': raise ProFormaError( "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) if charge_buffer: charge_number = charge_buffer() if adduct_buffer: adducts = adduct_buffer() else: adducts = None charge_state = ChargeState(charge_number, adducts) else: charge_state = None if current_aa: positions.append((current_aa, current_tag() if current_tag else None)) if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) return positions, { 'n_term': n_term, 'c_term': c_term, 'unlocalized_modifications': unlocalized_modifications, 'labile_modifications': labile_modifications, 'fixed_modifications': fixed_modifications, 'intervals': intervals, 'isotopes': isotopes, 'group_ids': sorted(current_tag.group_ids), 'charge_state': charge_state, }
[docs] def to_proforma( sequence, n_term: Optional[List[TagBase]] = None, c_term: Optional[List[TagBase]] = None, unlocalized_modifications: Optional[List[TagBase]] = None, labile_modifications: Optional[List[TagBase]] = None, fixed_modifications: Optional[List[TagBase]] = None, intervals: Optional[List[TaggedInterval]]=None, isotopes: Optional[List[StableIsotope]] = None, charge_state: Optional[ChargeState]=None, group_ids: Iterable[str]=None, names: Optional[Dict[int, str]] = None, ): '''Convert a sequence plus modifiers into formatted text following the ProForma specification. Parameters ---------- sequence : list[tuple[str, TagBase]] The primary sequence of the peptidoform/proteoform to render n_term : Optional[TagBase] The N-terminal modification, if any. c_term : Optional[TagBase] The C-terminal modification, if any. unlocalized_modifications : Optional[list[TagBase]] Any modifications which aren't assigned to a specific location. labile_modifications : Optional[list[TagBase]] Any labile modifications fixed_modifications : Optional[list[ModificationRule]] Any fixed modifications intervals : Optional[list[TaggedInterval]] A list of modified intervals, if any isotopes : Optional[list[StableIsotope]] Any global stable isotope labels applied charge_state : Optional[ChargeState] An optional charge state value group_ids : Optional[list[str]] Any group identifiers. This parameter is currently not used. Returns ------- str ''' if names is None: names = {} buffer = [] if 3 in names: buffer.append("(>>>") buffer.append(names[3]) buffer.append(")") if isotopes: for iso in isotopes: buffer.append(str(iso)) if fixed_modifications: for rule in fixed_modifications: buffer.append(str(rule)) if 2 in names: buffer.append("(>>") buffer.append(names[2]) buffer.append(")") if 1 in names: buffer.append("(>") buffer.append(names[1]) buffer.append(")") primary = deque() for aa, tags in sequence: if not tags: primary.append(str(aa)) else: primary.append(str(aa) + "".join(["[{0!s}]".format(t) for t in tags])) if intervals: for iv in sorted(intervals, key=lambda x: x.start): if iv.ambiguous: primary[iv.start] = "(?" + primary[iv.start] else: primary[iv.start] = "(" + primary[iv.start] terminator = "{0!s})".format(primary[iv.end - 1]) if iv.tags: terminator += "".join("[{!s}]".format(t) for t in iv.tags) primary[iv.end - 1] = terminator if n_term: primary.appendleft("".join("[{!s}]".format(t) for t in n_term) + "-") if c_term: primary.append("-" + "".join("[{!s}]".format(t) for t in c_term)) if charge_state: local_charge, _charge_mod_count = _local_charges( sequence, fixed_modifications=fixed_modifications, intervals=intervals, unlocalized_modifications=unlocalized_modifications, labile_modifications=labile_modifications, ) if local_charge != charge_state.charge: primary.append("/{!s}".format(charge_state.format_local(local_charge_to_remove=local_charge))) if labile_modifications: primary.extendleft(["{{{!s}}}".format(m) for m in labile_modifications]) if unlocalized_modifications: primary.appendleft("?") primary.extendleft(["[{!s}]".format(m) for m in unlocalized_modifications]) primary.appendleft("".join(buffer)) return "".join(primary)
class _ProFormaProperty(Generic[T]): def __init__(self, name): self.name = name def __get__(self, obj, cls) -> T: return obj.properties[self.name] def __set__(self, obj, value: T): obj.properties[self.name] = value def __repr__(self): template = "{self.__class__.__name__}({self.name!r})" return template.format(self=self)
[docs] class ProForma(object): '''Represent a parsed ProForma sequence. The preferred way to instantiate this class is via the :meth:`parse` method. Attributes ---------- sequence : list[tuple[str, List[TagBase]]] The list of (amino acid, tag collection) pairs making up the primary sequence of the peptide. isotopes : list[StableIsotope] A list of any stable isotope rules that apply to this peptide charge_state : int, optional An optional charge state that may have been provided intervals : list[Interval] Any annotated intervals that contain either sequence ambiguity or a tag over that interval. labile_modifications : list[ModificationBase] Any modifications that were parsed as labile, and may not appear at any location on the peptide primary sequence. unlocalized_modifications : list[ModificationBase] Any modifications that were not localized but may be attached to peptide sequence evidence. n_term : list[ModificationBase] Any modifications on the N-terminus of the peptide c_term : list[ModificationBase] Any modifications on the C-terminus of the peptide group_ids : set The collection of all groupd identifiers on this sequence. mass : float The computed mass for the fully modified peptide, including labile and unlocalized modifications. **Does not include stable isotopes at this time** ''' sequence: List[Tuple[str, Optional[List[TagBase]]]] properties: Dict[str, Any]
[docs] def __init__(self, sequence, properties): """ Initialize a :class:`ProForma` instance from a parse tree. To construct an instance from a string directly, see :meth:`ProForma.parse`. See Also -------- :meth:`ProForma.parse` """ self.sequence = sequence self.properties = properties
isotopes = _ProFormaProperty[List[StableIsotope]]('isotopes') _charge_state = _ProFormaProperty('charge_state') intervals = _ProFormaProperty[List[TaggedInterval]]('intervals') fixed_modifications = _ProFormaProperty[List[ModificationRule]]("fixed_modifications") labile_modifications = _ProFormaProperty[List[TagBase]]('labile_modifications') unlocalized_modifications = _ProFormaProperty[List[TagBase]]("unlocalized_modifications") n_term = _ProFormaProperty[List[TagBase]]("n_term") c_term = _ProFormaProperty[List[TagBase]]("c_term") names = _ProFormaProperty[Dict[int, str]]("names") group_ids = _ProFormaProperty('group_ids') def __str__(self): return to_proforma(self.sequence, **self.properties) def __repr__(self): return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self) def __len__(self): return len(self.sequence) def __getitem__(self, i: Union[int, slice]): if isinstance(i, slice): props = self.properties.copy() ivs = [] for iv in props['intervals']: iv = iv._update_coordinates_sliced( i.start, i.stop) if iv is None: continue ivs.append(iv) props['intervals'] = ivs if not (i.start is None or i.start == 0): props['n_term'] = None n = len(self) if not (i.stop is None or i.stop >= n): props['c_term'] = None subseq = self.__class__(self.sequence[i], props) if subseq.group_ids: kept_group_ids = [] for group_id in subseq.group_ids: tag_hits = subseq.find_tags_by_id(group_id, include_position=True) if not tag_hits: continue kept_group_ids.append(group_id) # We sliced the sequence, but only the localization markers were captured, # not the actual modification definition. Update the first occurrence of the # localization marker with a group id marked modification tag. if all(not isinstance(v, LocalizationMarker) for _, v in tag_hits): i = tag_hits[0] val: TagBase for val in self.find_tags_by_id(group_id, include_position=False): if not isinstance(val, LocalizationMarker): val = val.copy() for j, tag in enumerate(subseq[i][1]): if tag.group_id == group_id: subseq[i][1][j] = val return subseq else: return self.sequence[i] def __setitem__(self, i, val: Tuple[str, Optional[List[TagBase]]]): self.sequence[i] = val def __eq__(self, other): if isinstance(other, str): return str(self) == other elif other is None: return False else: return self.sequence == other.sequence and self.properties == other.properties def __ne__(self, other): return not self == other def __iter__(self): return iter(self.sequence) @property def charge_state(self) -> Optional[ChargeState]: """ Access the :class:`ChargeState` property of the :class:`ProForma` instance, which includes the total charge state and adduct list. This implies that you have a peptidoform *ion*, not a neutral peptide. """ z = self._charge_state return z def _local_charges(self) -> Tuple[int, int]: """ Count the number of localized charges that the :class:`ProForma` sequence has. This specifically counts modifications with a registered charge state like charged :class:`FormulaModification` instances. Returns ------- local_charges : int The total charge state attributable to localized modifications n_charged_modifications : int The number of charged modifications on the sequence """ return _local_charges( self.sequence, self.intervals, self.unlocalized_modifications, self.labile_modifications, self.fixed_modifications ) @charge_state.setter def charge_state(self, value: Union[int, ChargeState, None]): """ Sets the charge state of the :class:`ProForma` instance. When setting with :const:`None`, this removes any existing charge state information. When setting with an :class:`int`, this removes the adduct information. Parameters ---------- value : :class:`int`, :class:`ChargeState`, or :const:`None` When setting with :const:`None`, this removes any existing charge state information. When setting with an :class:`int`, this removes the adduct information. """ if value is None: self._charge_state = None elif isinstance(value, ChargeState): self._charge_state = value else: value = int(value) existing = self._charge_state new = ChargeState(value) if existing is None: self._charge_state = new else: if len(existing.adducts) > 1 or existing.adducts[0].name != "H": warnings.warn(f"Overwriting {existing}'s charge value with {value}, replacing adducts with protons") self._charge_state = new
[docs] @classmethod def parse(cls, string, **kwargs): '''Parse a ProForma string. Parameters ---------- string : str The string to parse **kwargs : Forwarded to :class:`Parser` Returns ------- ProForma ''' return cls(*parse(string, **kwargs))
@property def mass(self) -> float: ''' Compute the *total* monoisotopic neutral mass of the peptidoform. This does not include the adduct. ''' mass = 0.0 fixed_modifications = self.properties['fixed_modifications'] n_term_v = 0 c_term_v = len(self) - 1 for i, position in enumerate(self.sequence): aa = position[0].upper() try: mass += std_aa_mass[aa] except KeyError: warnings.warn("%r does not have an exact mass" % (aa, )) n_term = i == n_term_v c_term = i == c_term_v for rule in fixed_modifications: if rule.is_valid(aa, n_term, c_term): mod: ModificationBase = rule.modification_tag if mod.has_mass(): mass += mod.mass tags = position[1] if tags: for tag in tags: if tag.has_mass(): mass += tag.mass for mod in self.properties['labile_modifications']: mass += mod.mass for mod in self.properties['unlocalized_modifications']: mass += mod.mass if self.properties.get('n_term'): for mod in self.properties['n_term']: if mod.has_mass(): mass += mod.mass mass += calculate_mass(formula="H") if self.properties.get('c_term'): for mod in self.properties['c_term']: if mod.has_mass(): mass += mod.mass mass += calculate_mass(formula="OH") for iv in self.properties['intervals']: for tag in iv.tags: if tag.has_mass(): mass += tag.mass return mass
[docs] def mz(self, charge: Union[int, ChargeState, None] = None, **kwargs) -> float: """ Compute the *total* m/z of the peptidoform in the specified charge state, or fall back to the peptidoform ion's defined charge state and adduction. This method first tries to get the composition of the peptidoform ion with :meth:`composition` and then forwards ``kwargs`` to :meth:`Composition.mass` to compute m/z with full flexibility, but if that fails due to missing modification compositions, this method falls back to directly computing monoisotopic mass and uses the charge state to get the m/z. .. warning:: If no charge state of any kind is available, this will raise a :py:class:`MissingChargeStateError`. Parameters ---------- charge : int or :class:`ChargeState`, optional The charge state either as in integer number of protons gained/lost, or a :class:`ChargeState` instance. If not provided, :attr:`charge_state` will be used. **kwargs : Forwarded to :meth:`Composition.mass` Returns ------- float """ charge_state = charge if charge_state is None: charge_state = self.charge_state elif isinstance(charge_state, Integral): charge_state = ChargeState(int(charge_state)) elif not isinstance(charge_state, ChargeState): raise TypeError( f"Expected a charge state-like type, got {type(charge_state)}" ) if charge_state is None: raise MissingChargeStateError( f"Requested an m/z value without providing a charge state and the peptidoform {self!r} does " "not have a charge state itself." ) if charge_state and self.charge_state is not None and charge_state != self.charge_state: # Alternatively, try to guess from total charge and adducts all_charge_explained_by_adducts = self.charge_state.is_complete() if not all_charge_explained_by_adducts: warnings.warn( "Overriding charge state on a ProForma sequence with a charged modification. Only the overriding charge state will be used" ) try: composition = self.composition(include_charge=False, ignore_missing=False) charge = charge_state.charge return composition.mass(charge=charge, charge_carrier=charge_state.composition(), carrier_charge=charge, **kwargs) except ProFormaError: charge_carrier_mass, charge = charge_state.for_mz_calculation() return (self.mass + charge_carrier_mass) / abs(charge)
[docs] def fragments(self, ion_shift, charge=1, reverse=None, include_labile=True, include_unlocalized=True): """ The function generates all possible fragments of the requested series type. Parameters ---------- ion_shift : float or str The mass shift of the ion series, or the name of the ion series charge : int The charge state of the theoretical fragment masses to generate. Defaults to 1+. If 0 is passed, neutral masses will be returned. reverse : bool, optional Whether to fragment from the N-terminus (``False``) or C-terminus (``True``). If ``ion_shift`` is a :class:`str`, the terminal will be inferred from the series name. Otherwise, defaults to ``False``. include_labile : bool, optional Whether or not to include dissociated modification masses. Defaults to ``True`` include_unlocalized : bool, optional Whether or not to include unlocalized modification masses. Defaults to ``True`` Returns ------- np.ndarray Examples -------- >>> p = proforma.ProForma.parse("PEPTIDE") >>> p.fragments('b', charge=1) array([ 98.06004032, 227.1026334 , 324.15539725, 425.20307572, 538.2871397 , 653.31408272]) >>> p.fragments('y', charge=1) array([148.06043424, 263.08737726, 376.17144124, 477.21911971, 574.27188356, 703.31447664]) """ if isinstance(ion_shift, str): if ion_shift[0] in 'xyz': reverse = True ion_shift = std_ion_comp[ion_shift].mass(absolute=False) z = self.charge_state if z and not z.is_complete() and charge != 0 and charge != z: warnings.warn("A localized charge modification was detected. Its charge contribution will be ignored") n = len(self.sequence) masses = _array('d') mass = 0 mass += ion_shift fixed_modifications = self.properties['fixed_modifications'] intervals = self.intervals if intervals: intervals = sorted(intervals, key=lambda x: x.start, reverse=reverse) intervals = deque(intervals) if not include_labile: for mod in self.properties['labile_modifications']: mass += mod.mass if not reverse: if self.properties.get('n_term'): for mod in self.properties['n_term']: if mod.has_mass(): mass += mod.mass else: if self.properties.get('c_term'): for mod in self.properties['c_term']: if mod.has_mass(): mass += mod.mass if include_unlocalized: for mod in self.properties['unlocalized_modifications']: if mod.has_mass(): mass += mod.mass mass += _WATER_MASS if not reverse: iterator = (iter(range(0, n - 1))) n_term_v = 0 c_term_v = n - 1 else: iterator = (reversed(range(1, n))) n_term_v = n - 1 c_term_v = 0 for i in iterator: position = self.sequence[i] aa = position[0].upper() if aa != 'X': try: mass += std_aa_mass[aa] except KeyError: warnings.warn("%r does not have an exact mass" % (aa, )) n_term = i == n_term_v c_term = i == c_term_v for rule in fixed_modifications: if rule.is_valid(aa, n_term, c_term): if rule.modification_tag.has_mass(): mass += rule.modification_tag.mass tags = position[1] if tags: for tag in tags: if tag.has_mass(): mass += tag.mass while intervals and intervals[0].contains(i): iv = intervals.popleft() for tag in iv.tags: if tag.has_mass(): mass += tag.mass masses.append(mass) if np is not None: masses = np.asarray(masses) if charge != 0: return mass_charge_ratio(masses, charge) return masses if charge != 0: for i, mass in enumerate(masses): masses[i] = mass_charge_ratio(mass, charge) return masses
[docs] def find_tags_by_id(self, tag_id, include_position=True): '''Find all occurrences of a particular tag ID Parameters ---------- tag_id : str The tag ID to search for include_position : bool Whether or not to return the locations for matched tag positions Returns ------- list[tuple[Any, TagBase]] or list[TagBase] ''' if not tag_id.startswith("#"): tag_id = "#" + tag_id matches = [] for i, (_token, tags) in enumerate(self.sequence): if tags: for tag in tags: if tag.group_id == tag_id: if include_position: matches.append((i, tag)) else: matches.append(tag) for iv in self.properties['intervals']: if iv.tag.group_id == tag_id: matches.append((iv, iv.tag) if include_position else iv.tag) for ulmod in self.properties['unlocalized_modifications']: if ulmod.group_id == tag_id: matches.append(('unlocalized_modifications', ulmod) if include_position else ulmod) for lamod in self.properties['labile_modifications']: if lamod.group_id == tag_id: matches.append(('labile_modifications', lamod) if include_position else lamod) return matches
@property def tags(self): return [tag for tags_at in [pos[1] for pos in self if pos[1]] for tag in tags_at]
[docs] def proteoforms(self, include_unmodified: bool = False, include_labile: bool = False, strip: bool = False, deepcopy: bool = False) -> Iterator["ProForma"]: """ Generate combinatorial localizations of modifications defined on this ProForma sequence. Parameters ---------- include_unmodified : :class:`bool` For all non-fixed modifications, include the case where the modification is not included anywhere. This is equivalent to how variable modification rules are applied in search engines. It still respects the number of copies of modifications included in the input. See ``expand_rules``. include_labile : :class:`bool` For all labile modifications, include the case where the modification is localized at every possible location or as a remaining labile modification. strip : :class:`bool` If :const:`True`, the generated peptidoforms will have all modification tags stripped of any extra information, leaving only the bare modification definition. deepcopy : :class:`bool` If :const:`True`, the generated peptidoforms will have all tags and modifications deep-copied. This is necessary if the generated peptidoforms will be modified in-place after generation, but adds overhead if they will be treated as immutable. Defaults to :const:`False`. Yields ------ :class:`ProForma` """ return iter(ProteoformCombinator(self, include_unmodified=include_unmodified, include_labile=include_labile, strip=strip, deepcopy=deepcopy))
peptidoforms = proteoforms def copy(self) -> "ProForma": sequence = [] for (aa, tags) in self: if tags: tags = [t.copy() for t in tags] sequence.append((aa, tags)) properties = self.properties.copy() for k in [ "labile_modifications", "n_term", "c_term", "unlocalized_modifications", "fixed_modifications", "isotopes", ]: properties[k] = [v.copy() for v in properties[k]] properties['names'] = properties['names'].copy() return self.__class__(sequence, properties)
[docs] def composition(self, include_charge: Union[bool, ChargeState]=False, aa_comp=None, ignore_missing=False) -> Composition: ''' Calculate the elemental composition of the ProForma sequence. Parameters ---------- include_charge : bool or :class:`ChargeState`, optional If True, then :attr:`charge_state` will be included in the composition. If a :class:`ChargeState` instance is passed, this charge and adduction will be included instead. Otherwise, composition of the neutral molecule will be returned. Defaults to False. aa_comp : dict, optional A dictionary mapping amino acid symbols to their respective compositions. If not provided, the standard amino acid composition will be used. ``X`` *always* has a mass of 0.0, regardless of this argument. ignore_missing : bool, optional If True, tags with missing composition will be silently ignored. If False (default), a :py:class:`CompositionNotFoundError` will be raised. .. note:: Amino acids not found in `aa_mass` will result in errors even with `ignore_missing=True`. Returns ------- Composition :py:class:`Composition` object representing the composition of the ProForma sequence. ''' if ignore_missing: def get_comp(tag): try: return tag.composition or Composition({}) except AttributeError: return Composition({}) else: def get_comp(tag): try: comp = tag.composition except AttributeError as e: raise CompositionNotFoundError(f'No composition found for tag {tag}') from e if comp is None: raise CompositionNotFoundError(f'No composition found for tag {tag}') return comp comp = Composition() if aa_comp is None: aa_comp = std_aa_comp try: for i, (aa, tags) in enumerate(self.sequence): if aa != 'X': comp += aa_comp[aa] for tag in tags or []: if not tag.has_composition(): if isinstance(tag, MassModification) and not ignore_missing: raise CompositionNotFoundError(f"No composition found for tag {tag}") continue comp += get_comp(tag) for rule in self.fixed_modifications: if rule.is_valid(aa, i == 0, i == len(self.sequence) - 1): comp += get_comp(rule.modification_tag) for tag in chain(self.labile_modifications, self.unlocalized_modifications, chain.from_iterable(interval.tags for interval in self.intervals)): comp += get_comp(tag) except KeyError as e: raise CompositionNotFoundError(f'No composition found for amino acid {aa}') from e if self.n_term: comp += get_comp(self.n_term) else: comp['H'] += 1 # Add hydrogen for N-terminus if self.c_term: comp += get_comp(self.c_term) else: comp += Composition({'O': 1, 'H': 1}) # Add -OH for C-terminus if include_charge and self.charge_state: comp += self.charge_state.composition() return comp
class GeneratorModificationRuleDirective: """ A helper for :class:`ProteoformCombinator` that maps modification rules to sequence locations. This type probably shouldn't be created directly. """ rule: ModificationRule region: Optional[TaggedInterval] = None colocal_known: bool = False colocal_unknown: bool = False limit: int = 1 labile: bool = False token: Optional[ModificationToken] = None strip: bool = False def __eq__(self, other): if other is None: return False return ( self.rule == other.rule and self.region == other.region and self.colocal_known == other.colocal_known and self.colocal_unknown == other.colocal_unknown and self.limit == other.limit and self.labile == other.labile ) def __ne__(self, other): return not self == other def __hash__(self): return hash(self.token) def __init__(self, rule, region=None, colocal_known: bool = False, colocal_unknown: bool = False, limit: int = 1, labile: bool = False, strip: bool = False): self.rule = rule self.region = region self.colocal_known = colocal_known self.colocal_unknown = colocal_unknown self.limit = limit self.labile = labile self.strip = strip self.token = getattr(self.rule.modification_tag, "key", None) def create(self) -> TagBase: tag = self.rule.modification_tag.copy() if self.strip: tag.extra.clear() return tag def __repr__(self): return f"{self.__class__.__name__}({self.rule}, {self.region}, {self.colocal_known}, {self.colocal_unknown})" @staticmethod def _extract_rule_features(tag: TagBase) -> Tuple[List[Any], bool, bool]: targets = [] colocal_known = False colocal_unknown = False for extra in tag.extra: if extra.type == TagTypeEnum.position_modifier: targets.append(extra.value) elif extra.type == TagTypeEnum.comkp: colocal_known = True elif extra.type == TagTypeEnum.comup: colocal_unknown = True return targets, colocal_known, colocal_unknown def _can_apply_with(self, tags) -> bool: if not tags: return True known = [] unknown = [] for tag in tags: if isinstance(tag, (ModificationBase, MassModification)): if tag._generated in (ModificationSourceType.Explicit, ModificationSourceType.Constant): known.append(tag) elif tag._generated == ModificationSourceType.Generated: unknown.append(tag) total_at = len(known) + len(unknown) can_known = bool((known and self.colocal_known) or not known) can_unknown = bool((unknown and self.colocal_unknown) or not unknown) return ((can_known and can_unknown) or (not can_known and not can_unknown)) and total_at < self.limit def find_positions(self, sequence: ProForma) -> List[int]: n = len(sequence) - 1 positions = [] group_id = self.rule.modification_tag.group_id for i, (aa, tags) in enumerate(sequence): if self.region and not self.region.contains(i): continue elif group_id is not None: if not tags: continue for tag in tags: # TODO: Implement combinatoric limits here if tag.group_id == group_id: positions.append(i) else: if self.rule.is_not_specific() or self.rule.is_valid(aa, i == 0, i == n): if self._can_apply_with(tags): positions.append(i) return positions @classmethod @memoize() def from_tagged_modification(cls, tag: TagBase, strip: bool = False) -> "GeneratorModificationRuleDirective": mod = tag.find_modification() if not mod: return elif not mod.group_id: return rule = ModificationRule(tag, []) _targets, colocal_known, colocal_unknown = cls._extract_rule_features(tag) return cls(rule, None, colocal_known, colocal_unknown, tag.limit, strip=strip) @classmethod @memoize() def from_unlocalized_rule(cls, tag: TagBase, strip: bool = False) -> "GeneratorModificationRuleDirective": mod = tag.find_modification() if not mod: return targets, colocal_known, colocal_unknown = cls._extract_rule_features(tag) rule = ModificationRule(modification_tag=mod, targets=targets) return cls(rule, None, colocal_known, colocal_unknown, tag.limit, strip=strip) @classmethod @memoize() def from_region_rule(cls, region: TaggedInterval, strip: bool = False) -> List['GeneratorModificationRuleDirective']: rules = [] for tag in (region.tags or []): mod = tag.find_modification() if not mod: continue targets, colocal_known, colocal_unknown = cls._extract_rule_features(tag) rule = ModificationRule(modification_tag=mod, targets=targets) rules.append(cls(rule, region, colocal_known, colocal_unknown, tag.limit, strip=strip)) return rules @classmethod @memoize() def from_labile_rule(cls, tag: TagBase, strip: bool = False) -> "GeneratorModificationRuleDirective": mod = tag.find_modification() if not mod: return targets, colocal_known, colocal_unknown = cls._extract_rule_features(tag) targets = [ModificationTarget(v) for v in targets] rule = ModificationRule(modification_tag=mod, targets=targets) return cls(rule, None, colocal_known, colocal_unknown, tag.limit, labile=True, strip=strip) def _coerce_string_to_modification(item, copy: bool = False) -> TagBase: if isinstance(item, TagBase): return item.copy() if copy else item elif isinstance(item, str): if copy: return TagParser(item)()[0] return _coerce_string_to_modification_cached(item) else: raise TypeError(f"Don't know how to coerce {item} of type {type(item)} to a modification") @memoize() def _coerce_string_to_modification_cached(item: str) -> TagBase: return TagParser(item)()[0] def peptidoforms( peptide: Union[ProForma, str], variable_modifications: Optional[ Union[ List[Union[TagBase, str]], dict[Union[TagBase, str], List[Union[str, TagBase]]], ] ] = None, fixed_modifications: Optional[ Union[ List[Union[TagBase, str]], dict[Union[TagBase, str], List[Union[str, TagBase]]], ] ] = None, include_unmodified: bool = True, include_labile: bool = False, expand_rules: bool = False, strip: bool = False, deepcopy: bool = False, ) -> Iterator[ProForma]: """ Generate the combinatorial cross-product of modifications for ``peptide``, given by a set of variable and fixed modification rules, as in a classical peptide search engine. This is similar to :func:`parser.peptidoforms`, but using :class:`ProForma` as the representation. This uses ProForma 2.1's position limiting rules to give the caller greater control over how modifications are applied, if desired. Internally, this delegates to :class:`ProteoformCombinator` and would mirror the behavior of embedding all of the modification rules directly in the sequence and calling :meth:`ProForma.proteoforms`. Parameters ---------- peptide : :class:`ProForma` or :class:`str` The base peptide to modify. If a string is provided, it will be parsed with :meth:`ProForma.parse`. If ``peptide`` itself encodes modification rules or unlocalized modifications of any kind, they **will** also be applied. variable_modifications : :class:`list` of :class:`str` or :class:`TagBase` modification rules, or a :class:`dict` mapping :class:`str` or :class:`TagBase` modifications to a list of :class:`str` or :class:`TagBase` targets The variable modifications that will be combined. If a list is provided, the values are assumed to either be strings encoding a modification tag in ProForma notation or pre-parsed :class:`TagBase` modifications with position limiting rules added with ``|`` separators. If a :class:`dict` is provided, keys are assumed to be :class:`TagBase` modifications, as in the list-case, but the values of those keys are expected to be :class:`TagBase` position limiters like :class:`PositionModifierTag`, or strings that will be coerced as such. fixed_modifications : :class:`list` of :class:`str` or :class:`TagBase` modification rules, or a :class:`dict` mapping :class:`str` or :class:`TagBase` modifications to a list of :class:`str` or :class:`TagBase` targets The fixed modifications that will be applied to all combinations, even the unmodified version if ``include_unmodified`` is specified. See ``variable_modifications`` for an explanation of type coercion. include_unmodified : :class:`bool` For all non-fixed modifications, include the case where the modification is not included anywhere. include_labile : :class:`bool` For all labile modifications, include the case where the modification is localized at every possible location. .. note :: Along with all possible localizations of the labile modifications, the output will contain peptidoform(s) where it is kept as a labile modification without localization. expand_rules : :class:`bool` For all variable modifications, allow any number of copies of the modification to be included in the result. This mirrors the expected behavior of many search engines' variable modification rules, though it is not strictly how ProForma's rules work. This forces :attr:`include_unmodified` to be :const:`True`. strip : :class:`bool` If :const:`True`, the generated peptidoforms will have all modification tags stripped of any extra information, leaving only the bare modification definition. deepcopy : :class:`bool` If :const:`True`, the generated peptidoforms will be deep-copied from the original before modifications are applied. This is more expensive, but allows for the generated peptidoforms to be modified independently of the original. If :const:`False`, modifications will be applied in-place on the original and yielded as-is, which is more efficient but means that modifying the generated peptidoforms will have unpredictable side effects. Yields ------ :class:`ProForma` Examples -------- This example shows how to use the :class:`dict`-based modification rule approach. >>> from pyteomics import proforma >>> isos = proforma.peptidoforms( ... "EMEVTESPEK", ... variable_modifications={"Oxidation": ['M']}) >>> for i in isos: ... print(i) EMEVTESPEK EM[Oxidation|Position:M]EVTESPEK Using parsed objects to get the equivalent behavior, and avoids needing to re-parse the rules on every invocation. >>> from pyteomics import proforma >>> pforms = proforma.peptidoforms( ... ProForma.parse("EMEVTESPEK"), ... variable_modifications={proforma.GenericModification("Oxidation"): [proforma.PositionModifierTag('M')]}) >>> for i in pforms: ... print(i) EMEVTESPEK EM[Oxidation|Position:M]EVTESPEK To expand rules so that they might apply to as many positions as are available, as is often done when build a combinatorial search space, use the ``expand_rules`` argument. >>> from pyteomics import proforma >>> isos = proforma.peptidoforms( ... "EMEVTESPEK", ... variable_modifications={"Oxidation": ['M'], "Phospho": ['S', 'T']}, expand_rules=True) >>> for i in isos: ... print(i) EM[Oxidation|Position:M]EVT[Phospho|Position:T]S[Phospho|Position:S]ES[Phospho|Position:S]PEK EMEVT[Phospho|Position:T]S[Phospho|Position:S]ES[Phospho|Position:S]PEK EM[Oxidation|Position:M]EVTS[Phospho|Position:S]ES[Phospho|Position:S]PEK EMEVTS[Phospho|Position:S]ES[Phospho|Position:S]PEK EM[Oxidation|Position:M]EVT[Phospho|Position:T]S[Phospho|Position:S]ESPEK EMEVT[Phospho|Position:T]S[Phospho|Position:S]ESPEK EM[Oxidation|Position:M]EVTS[Phospho|Position:S]ESPEK EMEVTS[Phospho|Position:S]ESPEK EM[Oxidation|Position:M]EVT[Phospho|Position:T]SES[Phospho|Position:S]PEK EMEVT[Phospho|Position:T]SES[Phospho|Position:S]PEK EM[Oxidation|Position:M]EVTSES[Phospho|Position:S]PEK EMEVTSES[Phospho|Position:S]PEK EM[Oxidation|Position:M]EVT[Phospho|Position:T]SESPEK EMEVT[Phospho|Position:T]SESPEK EM[Oxidation|Position:M]EVTSESPEK EMEVTSESPEK """ if isinstance(peptide, str): peptide = ProForma.parse(peptide) if expand_rules: include_unmodified = True template = peptide.copy() seen = set() if variable_modifications: if isinstance(variable_modifications, list): extra_rules = [] for rule in map(partial(_coerce_string_to_modification, copy=deepcopy), variable_modifications): if expand_rules: parsed_rule = GeneratorModificationRuleDirective.from_unlocalized_rule( rule, strip=strip ) extra_rules.extend([rule] * len(parsed_rule.find_positions(template) * parsed_rule.limit)) else: extra_rules.append(rule) template.unlocalized_modifications.extend(extra_rules) elif isinstance(variable_modifications, dict): extra_rules = [] for tag, targets in variable_modifications.items(): seen.clear() tag = _coerce_string_to_modification(tag, copy=deepcopy) for target in targets: if isinstance(target, str): target = PositionModifierTag(target) if target in seen: continue seen.add(target) tag = tag | target if expand_rules: rule = GeneratorModificationRuleDirective.from_unlocalized_rule(tag, strip=strip) n = len(rule.find_positions(peptide)) * rule.limit extra_rules.extend([tag] * n) else: extra_rules.append(tag) template.unlocalized_modifications.extend(extra_rules) else: raise TypeError(f"Expected variable_modifications to be a list or a dict, got {type(variable_modifications)}") if fixed_modifications: if isinstance(fixed_modifications, list): template.fixed_modifications.extend(map(partial(_coerce_string_to_modification, copy=deepcopy), fixed_modifications)) elif isinstance(fixed_modifications, dict): extra_rules = [] for tag, targets in fixed_modifications.items(): seen.clear() for target in targets: if isinstance(target, str): target = PositionModifierTag(target) if target in seen: continue seen.add(target) tag = _coerce_string_to_modification(tag, copy=deepcopy) extra_rules.append(tag | target) template.fixed_modifications.extend(extra_rules) else: raise TypeError( f"Expected fixed_modifications to be a list or a dict, got {type(fixed_modifications)}" ) return template.proteoforms( include_unmodified=include_unmodified, include_labile=include_labile, strip=strip, deepcopy=deepcopy, ) proteoforms = peptidoforms class ProteoformCombinator: """ Generate combinations of modification (co)localizations for modifications that aren't at a fixed position specified in the original sequence. Attributes ---------- template: :class:`ProForma` The template sequence to apply any combination of rules to variable_rules: list[:class:`GeneratorModificationRuleDirective`] The rules to apply in combinations to the template sequence. include_unmodified : :class:`bool` For all non-fixed modifications, include the case where the modification is not included anywhere. This is equivalent to how variable modification rules are applied in search engines. It still respects the number of copies of modifications included in the input. See :attr:`expand_rules`. include_labile : :class:`bool` For all labile modifications, include the case where the modification is localized at every possible location. .. note :: Along with all possible localizations of the labile modifications, the output will contain peptidoform(s) where it is kept as a labile modification without localization. strip : :class:`bool` If :const:`True`, the generated peptidoforms will have all modification tags stripped of any extra information, leaving only the bare modification definition. deepcopy : :class:`bool` If :const:`True`, the combinator will deepcopy the template for every generated proteoform, which is safer but more computationally expensive. If :const:`False`, the combinator will attempt to reuse the same template and apply modifications in-place, which is faster but may lead to unintended side effects if the generated proteoforms are modified after generation. Defaults to :const:`False`. """ template: ProForma include_unmodified: bool include_labile: bool variable_rules: List[GeneratorModificationRuleDirective] def __init__(self, base_proteoform: ProForma, include_unmodified: bool=False, include_labile: bool=False, strip: bool=False, deepcopy: bool=False): self.deepcopy = deepcopy if deepcopy: self.template = base_proteoform.copy() else: self.template = ProForma(base_proteoform.sequence.copy(), base_proteoform.properties.copy()) self.include_unmodified = include_unmodified self.include_labile = include_labile self.strip = strip self.variable_rules = [] self._extract_rules() self._apply_fixed_modifications() self._iter = self.generate() def _apply_fixed_modifications(self): for c in self.template.fixed_modifications: rule = GeneratorModificationRuleDirective(c) positions = rule.find_positions(self.template) for i in positions: (aa, tags) = self.template[i] if not tags: tags = [] tag = rule.create() if isinstance(tag, (MassModification, ModificationBase)): tag._generated = ModificationSourceType.Constant tags.append(tag) self.template[i] = (aa, tags) self.template.fixed_modifications.clear() def _extract_rules(self) -> None: rules = [] remains = [] for iv in self.template.intervals: block = GeneratorModificationRuleDirective.from_region_rule(iv) if block: rules.extend(block) iv = iv.copy() iv.tags = [t for t in iv.tags if not t.is_modification()] remains.append(iv) else: remains.append(iv) self.template.intervals = remains remains = [] for rule in self.template.unlocalized_modifications: rule_ = GeneratorModificationRuleDirective.from_unlocalized_rule(rule, strip=self.strip) if rule_: rules.append(rule_) else: remains.append(rule) self.template.unlocalized_modifications = remains for (_, tags) in self.template: if tags: tmp = [] for tag in tags: if tag.group_id and tag.is_modification(): rule_ = GeneratorModificationRuleDirective.from_tagged_modification(tag, strip=self.strip) if rule_: rules.append(rule_) tmp.append(PositionLabelTag(group_id=tag.group_id)) else: tmp.append(tag) tags[:] = tmp if self.include_labile: remains = [] for rule in self.template.labile_modifications: rule_ = GeneratorModificationRuleDirective.from_labile_rule(rule, strip=self.strip) if rule_: rules.append(rule_) else: remains.append(rule) self.template.labile_modifications = remains self.variable_rules = rules def __iter__(self): return self def __next__(self): return next(self._iter) def _build_position_map(self) -> List[List[Optional[int]]]: position_choices = [] for rule in self.variable_rules: positions_for = rule.find_positions(self.template) if rule.labile or self.include_unmodified or not positions_for: positions_for = [None] + positions_for position_choices.append(positions_for) return position_choices def _build_modification_iter(self) -> Iterator[Iterator[Tuple[Optional[int], GeneratorModificationRuleDirective]]]: position_choices = self._build_position_map() return map(lambda pos: zip(pos, self.variable_rules), itertools.product(*position_choices)) def generate(self): seen = set() for slots in self._build_modification_iter(): state = Counter() if self.deepcopy: template = self.template.copy() else: template = ProForma(self.template.sequence.copy(), self.template.properties.copy()) valid = True labile_remaining = [] for idx, rule in slots: if rule is None: continue if idx is None: if rule.labile: state[((None, rule.token))] += 1 labile_remaining.append(rule.create()) continue if not rule._can_apply_with(template.sequence[idx][1]): valid = False break (aa, tags) = template[idx] if not tags: tags = [] tag = rule.create() tag._generated = ModificationSourceType.Generated tags.append(tag) template[idx] = (aa, tags) state[((idx, rule.token))] += 1 if valid: state = frozenset(state.items()) if state in seen: continue else: seen.add(state) if labile_remaining: template.labile_modifications = labile_remaining yield template

Contents