Source code for biocutils.match

from functools import singledispatch
from typing import Any, Literal, Optional, Sequence, Union

import numpy


[docs] class MatchIndex: """ An index for matching one or more ``x`` against different ``targets``. This is typically constructed by :py:func:`~create_match_index`. """ def __init__( self, targets: Any, duplicate_method: Literal["first", "last", "any"] = "first", incomparables: Union[set, Sequence] = set(), dtype: Optional[numpy.dtype] = None, fail_missing: Optional[bool] = None, ): """ Args: targets: Targets to be matched against, see :py:func:`~match` for details. duplicate_method: How to handle duplicate entries in ``targets``, see :py:func:`~match` for details. incomparables: Values that cannot be compared, see :py:func:`~match` for details. dtype: NumPy type of the output array, see :py:func:`~match` for details. fail_missing: Whether to raise an error if a value cannot be found in ``targets``, see :py:func:`~match` for details. """ from .Factor import Factor if isinstance(targets, dict): # Back-compatible behavior. import warnings warnings.warn(DeprecationWarning("'map_to_index()' is deprecated, use 'create_match_index()' instead")) self._map = targets elif isinstance(targets, Factor): # Optimized method when both x and targets are factors. target_index = [None] * (len(targets.get_levels()) + 1) # add 1 so that code = -1 still behaves correctly. first_tie = duplicate_method == "first" or duplicate_method == "any" for i, code in enumerate(targets.get_codes()): if not first_tie or target_index[code] is None: target_index[code] = i mapping = {} for i, lev in enumerate(targets.get_levels()): if lev not in incomparables: candidate = target_index[i] if candidate is not None: mapping[lev] = candidate if None not in incomparables: # None matching to another None is still possible. candidate = target_index[-1] if candidate is not None: mapping[None] = target_index[-1] self._map = mapping else: first_tie = duplicate_method == "first" or duplicate_method == "any" mapping = {} for i, val in enumerate(targets): if val not in incomparables: if not first_tie or val not in mapping: mapping[val] = i self._map = mapping if dtype is None: dtype = numpy.min_scalar_type(-len(targets)) # get a signed type self._dtype = dtype if fail_missing is None: fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger) self._fail_missing = fail_missing
[docs] def match(self, x: Any) -> numpy.ndarray: """ Args: x: Values to match against ``targets``. Returns: NumPy array of length equal to ``x``, containing the integer position of each entry of ``x`` inside ``targets``; see :py:func:`~match` for more details. """ from .Factor import Factor indices = numpy.zeros(len(x), dtype=self._dtype) if not isinstance(x, Factor): # Separate loops to reduce branching in the tight inner loop. if not self._fail_missing: for i, y in enumerate(x): if y in self._map: indices[i] = self._map[y] else: indices[i] = -1 else: for i, y in enumerate(x): if y not in self._map: raise ValueError("cannot find '" + str(y) + "' in 'targets'") indices[i] = self._map[y] else: x_index = [-1] * (len(x.get_levels()) + 1) # adding 1 so that code = -1 still works. for i, lev in enumerate(x.get_levels()): if lev in self._map: x_index[i] = self._map[lev] if None in self._map: x_index[-1] = self._map[None] # Separate loops to reduce branching in the tight inner loop. if self._fail_missing: for i, code in enumerate(x.get_codes()): candidate = x_index[code] if candidate < 0: raise ValueError("cannot find '" + str(x[i]) + "' in 'targets'") indices[i] = candidate else: for i, code in enumerate(x.get_codes()): indices[i] = x_index[code] return indices
[docs] @singledispatch def create_match_index( targets: Any, duplicate_method: Literal["first", "last", "any"] = "first", incomparables: Union[set, Sequence] = set(), dtype: Optional[numpy.dtype] = None, fail_missing: Optional[bool] = None, ) -> MatchIndex: """ Create a index for matching an arbitrary sequence against ``targets``. Calling ``create_match_index(targets, ...).match(x)`` is equivalent to ``match(x, targets, ...)``. Args: targets: Targets to be matched against, see :py:func:`~match` for details. duplicate_method: How to handle duplicate entries in ``targets``, see :py:func:`~match` for details. incomparables: Values that cannot be compared, see :py:func:`~match` for details. dtype: NumPy type of the output array, see :py:func:`~match` for details. fail_missing: Whether to raise an error if a value cannot be found in ``targets``, see :py:func:`~match` for details. Returns: A ``MatchIndex``. Other implementations of ``create_match_index()`` may return any object that has a ``match()`` method. Examples: >>> import biocutils >>> mobj = biocutils.create_match_index( ... [ ... "A", ... "B", ... "C", ... "D", ... ] ... ) >>> mobj.match( ... [ ... "A", ... "B", ... "B", ... "C", ... "C", ... "D", ... "E", ... ] ... ) >>> >>> ft = biocutils.Factor.from_sequence( ... [ ... "a", ... "B", ... "c", ... "D", ... "e", ... "B", ... "D", ... ] ... ) >>> fobj = biocutils.create_match_index( ... ft ... ) >>> fx = biocutils.Factor.from_sequence( ... [ ... "A", ... "B", ... "B", ... "C", ... "C", ... "D", ... "E", ... ] ... ) >>> fobj.match(fx) """ return MatchIndex( targets, duplicate_method=duplicate_method, incomparables=incomparables, dtype=dtype, fail_missing=fail_missing )
[docs] @singledispatch def match( x: Any, targets: Any, duplicate_method: Literal["first", "last", "any"] = "first", incomparables: Union[set, Sequence] = set(), dtype: Optional[numpy.dtype] = None, fail_missing: Optional[bool] = None, ) -> numpy.ndarray: """ Find a matching value of each element of ``x`` in ``targets``. Calling ``match(x, targets, ...)`` should be equivalent to ``create_match_index(targets, ...).match(x)``. Args: x: Values to match against ``targets``. targets: Targets to be matched against. It is not strictly necessary that ``x`` is of the same type as ``targets``, but entries of ``x`` should be capable of being equal to entries of ``x``. duplicate_method: How to handle duplicate entries in ``targets``. Either the first, last or any occurrence of each target is reported. incomparables: Values of ``x`` or ``targets`` that cannot be compared. No match will be reported for any value of ``x`` that is in ``incomparables``. Any object that has an ``__in__`` method can be used here. dtype: NumPy type of the output array. This should be an integer type; if missing values are expected, the type should be a signed integer. If ``None``, a suitable signed type is automatically determined. fail_missing: Whether to raise an error if ``x`` cannot be found in ``targets``. If ``None``, this defaults to ``True`` if ``dtype`` is an unsigned type, otherwise it defaults to ``False``. Returns: NumPy array of length equal to ``x``, containing the integer position of each entry of ``x`` inside ``targets``; or -1, if the entry of ``x`` is ``None`` or cannot be found in ``targets``. Examples: >>> import biocutils >>> biocutils.match( ... [ ... "A", ... "B", ... "B", ... "C", ... "D", ... "D", ... "E", ... ], ... [ ... "A", ... "B", ... "C", ... "D", ... ], ... ) >>> >>> fx = biocutils.Factor.from_sequence( ... [ ... "A", ... "B", ... "B", ... "C", ... "C", ... "D", ... "E", ... ] ... ) >>> ft = biocutils.Factor.from_sequence( ... [ ... "a", ... "B", ... "c", ... "D", ... "e", ... "B", ... "D", ... ] ... ) >>> biocutils.match( ... fx, ... ft, ... duplicate_method="last", ... ) """ obj = create_match_index( targets, duplicate_method=duplicate_method, incomparables=incomparables, dtype=dtype, fail_missing=fail_missing ) return obj.match(x)