Source code for biocutils.match

from typing import Sequence, Union, Optional
import numpy

from .map_to_index import DUPLICATE_METHOD, map_to_index


def match(
    x: Sequence,
    targets: Union[dict, Sequence],
    duplicate_method: DUPLICATE_METHOD = "first",
    dtype: Optional[numpy.dtype] = None,
    fail_missing: Optional[bool] = None,
) -> numpy.ndarray:
    """Find a matching value of each element of ``x`` in ``targets``.

    Args:
        x:
            Sequence of values to match.

        targets:
            Sequence of targets to be matched against. Alternatively, a
            dictionary generated by passing a sequence of targets to
            :py:meth:`~biocutils.map_to_index.map_to_index`; a dictionary is
            used as-is and ``duplicate_method`` is then ignored.

        duplicate_method:
            How to handle duplicate entries in ``targets``. Matches can be
            reported to the first or last occurrence of duplicates.

        dtype:
            NumPy type of the output array. This should be an integer type;
            if missing values are expected, the type should be a signed
            integer. If None, the smallest signed integer type that can hold
            both -1 and the largest position stored in ``targets`` is used.

        fail_missing:
            Whether to raise an error if an entry of ``x`` cannot be found in
            ``targets``. If ``None``, this defaults to ``True`` if ``dtype``
            is an unsigned type, otherwise it defaults to ``False``.

    Returns:
        Array of length equal to ``x``, containing the integer position of
        each entry of ``x`` inside ``targets``; or -1, if the entry of ``x``
        is None or cannot be found in ``targets``.

    Raises:
        ValueError: If ``fail_missing`` is in effect and an entry of ``x``
            is absent from ``targets``.
    """
    if not isinstance(targets, dict):
        targets = map_to_index(targets, duplicate_method=duplicate_method)

    if dtype is None:
        # Size the type from the largest stored position rather than from the
        # number of keys: duplicates in the original targets (or a sparse
        # user-supplied dictionary) make the key count smaller than the
        # largest position. Negating (largest + 1) forces a signed type that
        # holds both ``largest`` and -1, and never degenerates to
        # ``min_scalar_type(0)`` (an unsigned type) when ``targets`` is empty.
        largest = max(targets.values(), default=0)
        dtype = numpy.min_scalar_type(-(int(largest) + 1))

    indices = numpy.zeros(len(x), dtype=dtype)

    if fail_missing is None:
        # An unsigned output cannot represent -1, so missing entries must fail.
        fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)

    # Separate loops to reduce branching in the tight inner loop.
    if not fail_missing:
        for i, y in enumerate(x):
            indices[i] = targets.get(y, -1)
    else:
        for i, y in enumerate(x):
            if y not in targets:
                raise ValueError(f"cannot find '{y!s}' in 'targets'")
            indices[i] = targets[y]

    return indices