Pyteomics documentation v4.7.1

pyteomics.mztab

Contents

Source code for pyteomics.mztab

"""
mztab - mzTab file reader
=========================

Summary
-------

`mzTab <https://github.com/HUPO-PSI/mzTab>`_  is one of the standards
developed by the Proteomics Informatics working group of the HUPO Proteomics
Standard Initiative.

This module provides a way to read mzTab files into a collection of
:py:class:`pandas.DataFrame` instances in memory, along with a mapping
of the file-level metadata. MzTab specifications 1.0 and 2.0 are supported.

Data access
-----------

  :py:class:`MzTab` - a class representing a single mzTab file.

Helpers
-------

    :py:class:`Group` - a collection of metadata relating to one entity.


Internals
---------

    :py:class:`_MzTabTable` - a single table in an mzTab file.


Property Management
~~~~~~~~~~~~~~~~~~~

:mod:`mztab` uses metaprogramming to generate its metadata accessors. They are
built by these classes working in concert.

    :py:class:`MetadataBackedProperty`

    :py:class:`MetadataBackedCollection`

    :py:class:`MetadataPropertyAnnotator`

-------------------------------------------------------------------------------
"""

import re
import warnings

try:
    import pandas as pd
except ImportError:
    pd = None


from collections import OrderedDict

from pyteomics.auxiliary import _file_obj
from pyteomics.auxiliary import cvstr
from pyteomics.auxiliary.utils import add_metaclass


def _require_pandas():
    if pd is None:
        raise ImportError(
            "To load an mzTab file into pandas.DataFrame objects, you must install pandas!")


class MetadataBackedProperty(object):
    '''Our descriptor type which uses the instance's metadata attribute to carry its values

    Parameters
    ----------
    name : str
        The key of the owner's :attr:`metadata` mapping served by this descriptor.
    variant_required : sequence of str, optional
        The variant codes for which the key must be present. Either a tuple
        such as ``("M", "P")`` or a plain string such as ``"MP"`` works, since
        only ``len``, iteration and ``in`` are used on it.
    '''

    def __init__(self, name, variant_required=None):
        if variant_required is None:
            variant_required = ()
        self.name = name
        self.variant_required = variant_required
        self.__doc__ = self.build_docstring()

    def __repr__(self):
        template = "{self.__class__.__name__}(name={self.name!r}, variant_required={self.variant_required})"
        return template.format(self=self)

    def __get__(self, obj, objtype=None):
        '''Return the metadata value, or :const:`None` if it is absent and optional.

        Raises
        ------
        AttributeError
            If the key is absent and the owner's variant requires it.
        '''
        if obj is None and objtype is not None:
            # So the property can be seen for what it is
            return self
        value = obj.metadata.get(self.name)
        if value is None and self.variant_required:
            variant = obj.variant
            # The variant may still be None while the owner is being set up.
            # ``None in "MP"`` raises TypeError for string specifications, so
            # an undetermined variant is treated as "not required".
            if variant is not None and variant in self.variant_required:
                raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format(
                    self.name, variant))
        return value

    def __set__(self, obj, value):
        obj.metadata[self.name] = value

    def __delete__(self, obj):
        del obj.metadata[self.name]

    def build_docstring(self):
        '''Compose the ``__doc__`` text shown for the generated property.

        Returns
        -------
        str
        '''
        doc = '''Accesses the {self.name!r} key in the :attr:`metadata` mapping attached
        to this object.
        '''
        if self.variant_required:
            if len(self.variant_required) > 1:
                plural = 's'
            else:
                plural = ''
            requires = ' or '.join(['-%s' % v for v in self.variant_required])
            doc += '''
        This key must be present when the file is of {requires} variant{plural}.
        '''.format(requires=requires, plural=plural)
        doc += '''
        Returns
        -------
        object
        '''
        doc = doc.format(self=self)
        return doc
class MetadataBackedCollection(object):
    '''A read-only descriptor serving one gathered group of the instance's metadata.

    On access the owner's :attr:`metadata` is passed through the owner's
    ``gather`` method and the group stored under :attr:`name` is returned.

    Parameters
    ----------
    name : str
        The name of the gathered group to serve.
    variant_required : sequence of str, optional
        The variant codes for which the group must be present. Either a tuple
        such as ``("M", "P")`` or a plain string such as ``"MP"`` works.
    '''

    def __init__(self, name, variant_required=None):
        if variant_required is None:
            variant_required = ()
        self.name = name
        self.variant_required = variant_required
        self.__doc__ = self.build_docstring()

    def __repr__(self):
        # Same layout as MetadataBackedProperty.__repr__ for consistency.
        template = "{self.__class__.__name__}(name={self.name!r}, variant_required={self.variant_required})"
        return template.format(self=self)

    def __get__(self, obj, objtype=None):
        '''Return the gathered group, or :const:`None` if it is absent and optional.

        Raises
        ------
        AttributeError
            If the group is absent and the owner's variant requires it.
        '''
        if obj is None and objtype is not None:
            # So the property can be seen for what it is
            return self
        groups = obj.gather(obj.metadata)
        value = groups.get(self.name)
        if value is None and self.variant_required:
            variant = obj.variant
            # ``None in "MP"`` raises TypeError for string specifications, so
            # an undetermined variant is treated as "not required".
            if variant is not None and variant in self.variant_required:
                raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format(
                    self.name, variant))
        return value

    def build_docstring(self):
        '''Compose the ``__doc__`` text shown for the generated property.

        Returns
        -------
        str
        '''
        doc = '''Accesses the {self.name!r} key group gathered in the :attr:`metadata` mapping attached
        to this object.

        This group is dynamically generated on each access and may be expensive for repeated use.
        '''
        if self.variant_required:
            if len(self.variant_required) > 1:
                plural = 's'
            else:
                plural = ''
            requires = ' or '.join(['-%s' % v for v in self.variant_required])
            doc += '''
        This key must be present when the file is of {requires} variant{plural}.
        '''.format(requires=requires, plural=plural)
        doc += '''
        Returns
        -------
        :class:`~.Group`
        '''
        doc = doc.format(self=self)
        return doc
class MetadataPropertyAnnotator(type):
    '''A simple metaclass to do some class-creation time introspection
    and descriptor binding.

    Uses a list of strings or 3-tuples from :attr:`__metadata_properties__` to
    bind :class:`MetadataBackedProperty` or :class:`MetadataBackedCollection`
    onto the class during its creation.

    The specification for a property is a tuple of three values:
        1. The metadata key to fetch
        2. The property name to expose on the object
        3. The variant(s) which require this metadata key be present

    :obj:`("mzTab-version", "version", ("M", "P"))` would be interpreted as
    Expose a property "version" on instances which serves the key "mzTab-version"
    from the instance's :attr:`metadata`, and raise an error if it is absent in
    the "M" or "P" variants.

    Alternatively a specification may be a single string which will be interpreted
    as the metadata key, and used to generate the property name replacing all '-'
    with '_' and assumed to be optional in all variants.

    If a metadata key ends with "[]" the property is assumed to be a collection. mzTab
    makes heavy use of "<collection_name>[<index>]..." keys to define groups of
    homogenous object types, often with per-element attributes.

    .. code-block::

        variable_mod[1]            CHEMMOD:15.9949146221
        variable_mod[1]-site       M
        variable_mod[1]-position   Anywhere
        variable_mod[2]            CHEMMOD:42.0105646863
        variable_mod[2]-site       N-term
        variable_mod[2]-position   Protein N-term

    A specification :obj:`("variable_mod[]", "variable_mods", ())` would create a property
    that returns:

    .. code-block:: python

        >>>instance.variable_mods
        Group([(1,
                {'name': 'CHEMMOD:15.9949146221',
                 'position': 'Anywhere',
                 'site': 'M'}),
               (2,
                {'name': 'CHEMMOD:42.0105646863',
                 'position': 'Protein N-term',
                 'site': 'N-term'})])

    For precise description of the property collection algorithm, see
    :meth:`~_MzTabParserBase.collapse_properties` and :meth:`~_MzTabParserBase.gather`.

    If any base classes have a :attr:`__metadata_properties__` attribute, it will
    also be included unless :attr:`__inherit_metadata_properties__` is set to
    :const:`False`. Any names explicitly set by the current class override this
    automatic property generation.
    '''

    def __new__(mcls, name, bases, attrs):
        # NOTE: this is the very list object found in the class body (when
        # there is one), so extending it below also extends the
        # ``__metadata_properties__`` attribute of the class being created.
        specs = attrs.get('__metadata_properties__', [])

        # Pull in the specifications of the parent classes so that inheritance
        # can be used to extend or override them.
        if attrs.get("__inherit_metadata_properties__", True):
            for parent in bases:
                specs.extend(getattr(parent, '__metadata_properties__', []))

        # Names written explicitly in the class body always win over generated ones.
        explicit_names = set(attrs)

        # Walk the list backwards: the new class' own specifications come first
        # in the list, so they are applied last and replace anything generated
        # from a more basal class.
        for spec in reversed(specs):
            if isinstance(spec, str):
                # A bare string is the metadata key. Derive a valid attribute
                # name from it and treat the key as optional everywhere.
                metadata_key = spec
                attr_name = metadata_key.replace("mzTab-", '').replace('-', '_')
                variant_required = None
            else:
                metadata_key, attr_name, variant_required = spec

            if attr_name in explicit_names:
                continue

            if metadata_key.endswith('[]'):
                # A trailing "[]" marks a collection-valued property.
                descriptor = MetadataBackedCollection(
                    metadata_key[:-2], variant_required=variant_required)
            else:
                # Anything else is a scalar-valued property.
                descriptor = MetadataBackedProperty(
                    metadata_key, variant_required=variant_required)
            attrs[attr_name] = descriptor

        return super(MetadataPropertyAnnotator, mcls).__new__(mcls, name, bases, attrs)
class _MzTabParserBase(object): def _parse_param(self, tuplet): """Parse a controlled vocabulary or user specified parameter tuplet into a Python object Parameters ---------- tuplet : str A square brace enclosed tuplet of values describing the parameter Returns ------- tuple The reduced representation of the parameter """ cv, acc, name, value = re.split(r"\s*,\s*", tuplet[1:-1]) param_name = cvstr(name, acc) if value: return (param_name, value) else: return (param_name) def collapse_properties(self, proplist): '''Collapse a flat property list into a hierchical structure. This is intended to operate on :py:class:`Mapping` objects, including :class:`dict`, :class:`pandas.Series` and :class:`pandas.DataFrame`. .. code-block:: python { "ms_run[1]-format": "Andromeda:apl file format", "ms_run[1]-location": "file://...", "ms_run[1]-id_format": "scan number only nativeID format" } to .. code-block:: python { "ms_run": [ { "format": "Andromeda:apl file format", "location": "file://...", "id_format": "scan number only nativeID format" } ] } Parameters ---------- proplist: :class:`Mapping` Key-Value pairs to collapse Returns ------- :class:`OrderedDict`: The collapsed property list ''' entities = OrderedDict() rest = {} for key, value in proplist.items(): try: entity, prop_name = key.rsplit("-", 1) except ValueError: rest[key] = value continue try: entity_dict = entities[entity] except KeyError: entity_dict = entities[entity] = {} entity_dict[prop_name] = value for key, value in proplist.items(): if key in entities: entity = entities[key] if 'name' not in entity: entity['name'] = value for key, value in rest.items(): if key in entities: entities[key]['name'] = value else: entities[key] = value return entities def _collapse_collections(self, entities): gathered = Group() for key, props in entities.items(): if '[' in key: k, ix = key.split('[', 1) if '[' in ix: # If we have multiple [ in a key, we are dealing with a path path = extract_path(key) for k, ix in path[:-1]: store = 
gathered[k] store = store[int(ix)] k, ix = path[-1] store[k][int(ix)] = props else: ix = int(ix[:-1]) gathered[k][ix] = props else: gathered[key] = props return gathered def _cast_value(self, value): """Convert a cell value to the appropriate Python type Parameters ---------- value : str The cell value as text Returns ------- object The most specialized type recognized """ if value == 'null': return None # is it a parameter? if value.startswith("["): try: if "|" in value: return [self._cast_value(v) for v in value.split("|")] else: return self._parse_param(value) except ValueError: return value else: # begin guessing dtype try: value = int(value) except ValueError: try: value = float(value) except ValueError: pass return value def gather(self, mapping): '''Collapse property lists using :meth:`collapse_properties` and then gather collections of entites into lists. Parameters ---------- mapping : dict The flattened hierarchy of properties to re-construct Returns ------- Group : A :class:`Group` of all entities and collections of entities ''' return self._collapse_collections(self.collapse_properties(mapping))
class _MzTabTable(_MzTabParserBase):
    """An internal class for accumulating information about an single table
    represented in an mzTab file

    Attributes
    ----------
    header : list
        The column names for the table
    name : str
        The table's name, human readable
    rows : list
        An accumulator of table rows
    """

    def __init__(self, name, header=None, rows=None):
        if rows is None:
            rows = []
        self.name = name
        self.header = header
        self.rows = rows

    def __repr__(self):
        n_cols = len(self.header) if self.header is not None else 0
        n_rows = len(self.rows)
        template = "<_MzTabTable {name} with {n_cols} columns and {n_rows} rows>"
        return template.format(n_cols=n_cols, n_rows=n_rows, name=self.name)

    def add(self, row):
        """Cast every cell of `row` with :meth:`_cast_value` and append the result."""
        self.rows.append([self._cast_value(v) for v in row])

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        """Get one row, or a list of rows, gathered into :class:`Group` objects.

        Parameters
        ----------
        i : int or slice
            The row position(s) to fetch. Slices follow the usual sequence
            rules, including negative bounds, negative steps and clamping
            to the table length.

        Raises
        ------
        TypeError
            If `i` is neither an :class:`int` nor a :class:`slice`
        """
        if isinstance(i, int):
            return self.gather(dict(zip(self.header, self.rows[i])))
        elif isinstance(i, slice):
            # slice.indices() resolves omitted, negative and out-of-range
            # bounds the same way built-in sequences do. Testing the bounds
            # for truthiness would mistake an explicit 0 for "not given".
            return [self[j] for j in range(*i.indices(len(self)))]
        raise TypeError("Cannot access table with object of type %r" % type(i))

    def as_dict(self):
        """Convert the table to plain dictionaries.

        Returns
        -------
        dict
            The table name under ``"name"`` and one header-to-cell mapping
            per row under ``"rows"``
        """
        return {"rows": [dict(zip(self.header, row)) for row in self.rows],
                "name": self.name}

    def as_df(self, index=None):
        """Convert the table to a DataFrame in memory.

        Parameters
        ----------
        index : str, optional
            A column to use as the index. It is only applied to non-empty
            tables and the column itself is kept.

        Returns
        -------
        pd.DataFrame
        """
        _require_pandas()
        table = pd.DataFrame(data=self.rows, columns=self.header)
        if index is not None and len(table.index) > 0:
            table = table.set_index(index, drop=False)
        table.name = self.name
        return table

    def clear(self):
        """Forget the header and all accumulated rows."""
        self.header = None
        self.rows = []
# Names for the ``table_format`` values. 'df' and 'dict' are the two strings
# that MzTab._transform_tables checks for.
DATA_FRAME_FORMAT = 'df'
DICT_FORMAT = 'dict'
RAW_FORMAT = 'raw'

# Matches one ``key[index]`` step of a path key, swallowing a single "_"
# separator after the closing bracket if there is one.
PATH_PARSER = re.compile(r"([^\[]+)\[(\d+)\]_?")
def extract_path(path):
    '''Parse `key[index]_next_key[next_index]...` sequences into
    lists of (key, index) pairs.

    Parameters
    ----------
    path : str
        The path key to parse

    Returns
    -------
    list
        One ``(key, index)`` tuple per step found by :data:`PATH_PARSER`,
        with the index converted to :class:`int`. Empty when nothing matches.
    '''
    steps = []
    for match in PATH_PARSER.finditer(path):
        key, index = match.groups()
        steps.append((key, int(index)))
    return steps
class Group(OrderedDict):
    '''A type for holding collections of arbitrarily nested
    keys from rows and metadata mappings.

    Implemented as an autovivifying :class:`OrderedDict` variant. As such implements
    the :class:`~collections.abc.Mapping` interface.
    '''

    def get_path(self, path, default=None):
        '''As :meth:`get` but over a path key parsed with :func:`extract_path`.

        Parameters
        ----------
        path : str
            The path to search down
        default : object, optional
            The return value when the path is missing

        Returns
        -------
        object
            The value stored at the end of the path, or `default` when any
            step of the path is missing.
        '''
        tokens = extract_path(path)
        if not tokens:
            # Not a "key[index]" style path: fall back to a plain lookup.
            return self.get(path, default)
        layer = self
        # :meth:`get` is used throughout because it does not trigger
        # :meth:`__missing__`, so looking a path up never creates entries.
        for key, index in tokens[:-1]:
            layer = layer.get(key)
            if layer is None:
                return default
            layer = layer.get(index)
            if layer is None:
                return default
        key, index = tokens[-1]
        layer = layer.get(key)
        if layer is None:
            return default
        return layer.get(index, default)

    def __missing__(self, key):
        # Autovivification: indexing an unknown key creates, stores and
        # returns a new empty group of the same type.
        value = self.__class__()
        self[key] = value
        return value
@add_metaclass(MetadataPropertyAnnotator)
class MzTab(_MzTabParserBase):
    """Parser for mzTab format files.

    Attributes
    ----------
    comments : list
        A list of comments across the file
    file : _file_obj
        A file stream wrapper for the file to be read
    metadata : OrderedDict
        A mapping of the file's metadata entries, keyed by the
        metadata key exactly as it is written in the file.
    peptide_table : _MzTabTable or pd.DataFrame
        The table of peptides. Not commonly used.
    protein_table : _MzTabTable or pd.DataFrame
        The table of protein identifications.
    small_molecule_table : _MzTabTable or pd.DataFrame
        The table of small molecule identifications.
    small_molecule_feature_table : _MzTabTable or pd.DataFrame
        The table read from the ``SFH``/``SMF`` lines.
    small_molecule_evidence_table : _MzTabTable or pd.DataFrame
        The table read from the ``SEH``/``SME`` lines.
    spectrum_match_table : _MzTabTable or pd.DataFrame
        The table of spectrum-to-peptide match identifications.
    table_format: 'df', 'dict', or callable
        The structure type to replace each table with. The string
        'df' will use pd.DataFrame instances. 'dict' will create a dictionary
        of dictionaries for each table. A callable will be called on each raw
        _MzTabTable object
    variant : str
        The variant code, ``"P"`` or ``"M"``, read from the ``mzTab-version``
        metadata entry. ``"P"`` is assumed when none is given.
    num_version : list of int
        The three numeric components of the schema version.

    Additional components of :attr:`metadata` are exposed as properties, returning
    single values or aggregated collections of objects.
    """

    __metadata_properties__ = [
        ('mzTab-version', 'version', ()),
        ('mzTab-mode', 'mode', 'P'),
        ('mzTab-type', 'type', 'P'),
        ('mzTab-ID', 'id', 'M'),
        'title',
        'description',
        ('ms_run[]', 'ms_runs', 'MP'),
        ('instrument[]', 'instruments', ()),
        ('software[]', 'software', ()),
        ('publication[]', 'publications', ()),
        ('contact[]', 'contacts', ()),
        ('uri[]', 'uris', ()),
        ('external_study_uri[]', 'external_study_uris', ()),
        ('quantification_method', 'quantification_method', 'M'),
        ('sample[]', 'samples', ()),
        ('assay[]', 'assays', ()),
        ('study_variable[]', 'study_variables', 'M'),
        ('custom[]', 'custom', ()),
        ('cv[]', 'cvs', 'M'),
        ('database[]', 'databases', 'M'),
        ('psm_search_engine_score[]', 'psm_search_engine_scores', ()),
        ('protein_search_engine_score[]', 'protein_search_engine_scores', ()),
        ('fixed_mod[]', 'fixed_mods', 'P'),
        ('variable_mod[]', 'variable_mods', 'P'),
        # The column unit keys are written with a hyphen in mzTab files
        # ("colunit-protein"). The generated attribute names still use an
        # underscore. "colunit-small_molecule" is declared once, further down;
        # an earlier duplicate here would shadow it, because entries nearer
        # the top of this list take precedence.
        'colunit-protein',
        'colunit-peptide',
        'colunit-psm',
        'false_discovery_rate',
        ('derivatization_agent[]', 'derivatization_agents', ()),
        ('small_molecule-quantification_unit',
         'small_molecule_quantification_unit', 'M'),
        ('small_molecule_feature-quantification_unit',
         'small_molecule_feature_quantification_unit', 'M'),
        ('small_molecule-identification_reliability',
         'small_molecule_identification_reliability', ()),
        ('id_confidence_measure[]', 'id_confidence_measures', 'M'),
        ('colunit-small_molecule', 'colunit_small_molecule', ()),
        ('colunit-small_molecule_feature', 'colunit_small_molecule_feature', ()),
        ('colunit-small_molecule_evidence', 'colunit_small_molecule_evidence', ()),
        ('sample_processing[]', 'sample_processing', ())
    ]

    # The attributes holding the tables, in the order they are converted.
    _TABLE_ATTRIBUTES = (
        'protein_table',
        'peptide_table',
        'spectrum_match_table',
        'small_molecule_table',
        'small_molecule_feature_table',
        'small_molecule_evidence_table',
    )

    # Columns used as the index when a table is converted to a DataFrame.
    _DF_INDEX_COLUMNS = {
        'protein_table': 'accession',
        'spectrum_match_table': 'PSM_ID',
    }

    # Lower-cased keys accepted by :meth:`__getitem__`.
    _TABLE_KEYS = {
        'psm': 'spectrum_match_table',
        'pep': 'peptide_table',
        'prt': 'protein_table',
        'sml': 'small_molecule_table',
        'smf': 'small_molecule_feature_table',
        'sme': 'small_molecule_evidence_table',
    }

    def __init__(self, path, encoding='utf8', table_format=DATA_FRAME_FORMAT):
        """Read and parse the whole file.

        Parameters
        ----------
        path : object
            Passed on to :class:`_file_obj` to open the file for reading
        encoding : str, optional
            The text encoding of the file
        table_format : 'df', 'dict', or callable, optional
            See :attr:`table_format`

        Raises
        ------
        ImportError
            If DataFrames are requested while pandas is not installed
        """
        if table_format == DATA_FRAME_FORMAT:
            _require_pandas()
        # Must be defined in order for metadata properties to work
        self.variant = None
        self.file = _file_obj(path, mode='r', encoding=encoding)
        self.metadata = OrderedDict()
        self.comments = []
        self._table_format = table_format
        self._init_tables()
        self._parse()
        self._determine_schema_version()
        self._transform_tables()

    @property
    def table_format(self):
        return self._table_format

    def __getitem__(self, key):
        """Get a table by its row prefix, e.g. ``"PSM"``. Case and surrounding
        whitespace are ignored.

        Raises
        ------
        KeyError
            If `key` is not one of the known prefixes
        """
        key = key.lower().strip()
        try:
            attr_name = self._TABLE_KEYS[key]
        except KeyError:
            raise KeyError(key)
        return getattr(self, attr_name)

    def __iter__(self):
        """Yield ``(prefix, table)`` pairs for the tables of the file's variant."""
        if self.variant == "P":
            yield 'PRT', self.protein_table
            yield 'PEP', self.peptide_table
            yield 'PSM', self.spectrum_match_table
            yield 'SML', self.small_molecule_table
        elif self.variant == "M":
            yield 'SML', self.small_molecule_table
            yield 'SMF', self.small_molecule_feature_table
            yield 'SME', self.small_molecule_evidence_table

    def _init_tables(self):
        """Create one empty table accumulator per table type."""
        self.protein_table = _MzTabTable("protein")
        self.peptide_table = _MzTabTable("peptide")
        self.spectrum_match_table = _MzTabTable('psm')
        self.small_molecule_table = _MzTabTable('small molecule')
        self.small_molecule_feature_table = _MzTabTable('small molecule feature')
        self.small_molecule_evidence_table = _MzTabTable('small molecule evidence')

    def _transform_tables(self):
        """Replace each raw table with the representation selected by
        :attr:`table_format`. Any other format leaves the tables untouched.
        """
        table_format = self._table_format
        index_columns = self._DF_INDEX_COLUMNS

        if table_format == DATA_FRAME_FORMAT:
            def convert(attr_name, table):
                return table.as_df(index_columns.get(attr_name))
        elif table_format in (DICT_FORMAT, dict):
            def convert(attr_name, table):
                return table.as_dict()
        elif callable(table_format):
            def convert(attr_name, table):
                return table_format(table)
        else:
            return

        for attr_name in self._TABLE_ATTRIBUTES:
            setattr(self, attr_name, convert(attr_name, getattr(self, attr_name)))

    def _parse(self):
        """Read :attr:`file` line by line, filling the metadata, the comments
        and the tables. Lines with an unknown prefix are ignored.
        """
        header_targets = {
            "PRH": self.protein_table,
            "PEH": self.peptide_table,
            "PSH": self.spectrum_match_table,
            "SMH": self.small_molecule_table,
            "SFH": self.small_molecule_feature_table,
            "SEH": self.small_molecule_evidence_table,
        }
        row_targets = {
            "PRT": self.protein_table,
            "PEP": self.peptide_table,
            "PSM": self.spectrum_match_table,
            "SML": self.small_molecule_table,
            "SMF": self.small_molecule_feature_table,
            "SME": self.small_molecule_evidence_table,
        }
        for line in self.file:
            line = line.strip()
            if not line:
                continue
            tokens = line.split("\t")
            prefix = tokens[0]
            if prefix == "MTD":
                if len(tokens) < 2:
                    # A metadata line without a key carries no information.
                    continue
                # strip() above also removes a trailing tab, so an entry with
                # an empty value arrives here with only two fields.
                raw_value = tokens[2] if len(tokens) > 2 else ''
                self.metadata[tokens[1]] = self._cast_value(raw_value)
            elif prefix == 'COM':
                # Likewise, a comment line may have lost its empty text field.
                text = tokens[1] if len(tokens) > 1 else ''
                self.comments.append(self._cast_value(text))
            elif prefix in header_targets:
                header_targets[prefix].header = tokens[1:]
            elif prefix in row_targets:
                row_targets[prefix].add(tokens[1:])

    def _determine_schema_version(self):
        """Work out :attr:`num_version` and :attr:`variant` from the
        ``mzTab-version`` metadata entry, warning and assuming ``1.0.0`` / ``P``
        when it is missing or malformed.
        """
        if self.version is not None:
            version = str(self.version)
        else:
            warnings.warn("The mzTab-version metadata header was missing. Assuming the schema version is 1.0.0")
            version = "1.0.0"
        # Keep the string form in the metadata: a value such as "1.0" would
        # otherwise remain the float produced by the generic value casting.
        self.version = version
        match = re.search(r"(?P<schema_version>\d+(?:\.\d+(?:\.\d+)?)?)(?:-(?P<schema_variant>[MP]))?", version)
        if match is None:
            warnings.warn("mzTab-version does not match the expected pattern: %r" % version)
            version_parsed = '1.0.0'
            variant = 'P'
        else:
            version_parsed, variant = match.groups()
            if variant is None:
                variant = "P"
        self.num_version = [int(v) for v in version_parsed.split(".")]
        # Ensure self.num_version is 3-tuple
        while len(self.num_version) < 3:
            self.num_version.append(0)
        self.variant = variant

    # NOTE: the pairs are materialized with list() on purpose. Passing `self`
    # straight to OrderedDict would make it call self.keys() and recurse.
    def keys(self):
        return OrderedDict(list(self)).keys()

    def values(self):
        return OrderedDict(list(self)).values()

    def items(self):
        return OrderedDict(list(self)).items()

Contents