Pyteomics documentation v4.1.3dev1

pyteomics.peff

Contents

Source code for pyteomics.peff

"""
peff - PSI Extended FASTA Format
================================

PEFF is a forth-coming standard from PSI-HUPO formalizing and extending the
encoding of protein features and annotations for building search spaces for
proteomics. See `The PEFF specification <http://www.psidev.info/peff>`_ for
more up-to-date information on the standard.

Data manipulation
-----------------

Classes
.......

The PEFF parser inherits several properties from implementation in the :mod:`~.fasta` module,
building on top of the :class:`~.TwoLayerIndexedFASTA` reader.

Available classes:

  :py:class:`IndexedPEFF` - Parse a PEFF format file in binary-mode, supporting
  direct indexing by header string or by tag.

"""

#   Copyright 2018 Joshua Klein, Lev Levitsky
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import re
try:
    from collections.abc import Sequence as SequenceABC, Mapping
except ImportError:
    from collections import Sequence as SequenceABC, Mapping
from collections import OrderedDict, defaultdict

from .fasta import TwoLayerIndexedFASTA





[docs]class IndexedPEFF(TwoLayerIndexedFASTA): """Creates an :py:class:`IndexedPEFF` object. Parameters ---------- source : str or file The file to read. If a file object, it needs to be in *rb* mode. parse : bool, optional Defines whether the descriptions should be parsed in the produced tuples. Default is :py:const:`True`. kwargs : passed to the :py:class:`TwoLayerIndexedFASTA` constructor. """ kv_pattern = re.compile(r"\\(?P<key>\S+)=(?P<value>.+?)(?:\s(?=\\)|$)") header_pattern = re.compile(r"^>?(\S+):(\S+)") has_feature_index = re.compile(r"^\(?(\d+):") header_group = 2 class _PEFFFeature(SequenceABC): def __init__(self, *fields, **kwargs): self.fields = tuple(fields) self.id = kwargs.get('id') self.feature_type = kwargs.get("feature_type") def __eq__(self, other): return tuple(self) == tuple(other) def __ne__(self, other): return not (self == other) def __getitem__(self, i): return self.fields[i] def __len__(self): return len(self.fields) def __repr__(self): return repr(tuple(self)) def __str__(self): return "(%s%s)" % ( '%r:' % self.id if self.id is not None else '', '|'.join(map(str, self)), )
[docs] def __init__(self, source, ignore_comments=False, **kwargs): super(IndexedPEFF, self).__init__( source, ignore_comments=ignore_comments, parser=self.parser, header_pattern=self.header_pattern, **kwargs) self.header_blocks = [] self.comments = [] self.version = None self.number_of_entries = 0 self._parse_header()
def _parse_header(self): self.seek(0) line = self.readline().decode("ascii") if not line.startswith("# PEFF"): raise ValueError("Not a PEFF File") self.version = tuple(map(int, line.strip()[7:].split("."))) current_block = defaultdict(list) in_header = True while in_header: line = self.readline().decode("ascii") if not line.startswith("#"): in_header = False line = line.strip()[2:] if '=' in line: key, value = line.split("=", 1) if key == "GeneralComment": self.comments.append(value) else: current_block[key].append(value) if line.startswith("//"): if current_block: self.header_blocks.append( Header(OrderedDict((k, v if len(v) > 1 else v[0]) for k, v in current_block.items()))) current_block = defaultdict(list) number_of_entries = 0 for block in self.header_blocks: try: number_of_entries += int(block['NumberOfEntries']) except KeyError: pass self.number_of_entries = number_of_entries def _extract_parenthesis_list(self, text): chunks = [] chunk = [] paren_level = 0 i = 0 n = len(text) while i < n: c = text[i] i += 1 if c == "(": if paren_level > 0: chunk.append(c) paren_level += 1 elif c == ")": if paren_level > 1: chunk.append(c) paren_level -= 1 if paren_level == 0: if chunk: chunks.append(chunk) chunk = [] else: chunk.append(c) chunks = list(map(''.join, chunks)) return chunks def _split_pipe_separated_tuple(self, text): parts = text.split("|") return parts def _coerce_types(self, key, value): value = value.strip() feature_id_match = self.has_feature_index.search(value) if feature_id_match: feature_id = int(feature_id_match.group(1)) value = self.has_feature_index.sub('', value) else: feature_id = None if "|" in value: value = self._split_pipe_separated_tuple(value) result = [] for i, v in enumerate(value): result.append(self._coerce_value(key, v, i)) return self._PEFFFeature(*result, feature_type=key, id=feature_id) else: return self._coerce_value(key, value, 0) def _coerce_value(self, key, value, index): try: return int(value) except ValueError: pass try: return float(value) except ValueError: pass return str(value) def parser(self, line): match = self.header_pattern.match(line) if not match: raise ValueError( "Failed to parse {!r} using {!r}".format( line, self)) storage = OrderedDict() prefix = None db_uid = None if line.startswith(">"): line = line[1:] prefix, line = line.split(":", 1) db_uid, line = line.split(" ", 1) storage['Prefix'] = prefix storage['Tag'] = db_uid kv_pattern = re.compile(r"\\(?P<key>\S+)=(?P<value>.+?)(?:\s(?=\\)|$)") for key, value in kv_pattern.findall(line): if not (value.startswith("(") or " (" in value): storage[key] = self._coerce_types(key, value) else: # multi-value storage[key] = [self._coerce_types(key, v) for v in self._extract_parenthesis_list(value)] return Header(storage)

Contents