# Source code for biocutils.match
from typing import Optional, Sequence, Union
import numpy
from .map_to_index import DUPLICATE_METHOD, map_to_index
# [docs]
def match(
    x: Sequence,
    targets: Union[dict, Sequence],
    duplicate_method: DUPLICATE_METHOD = "first",
    dtype: Optional[numpy.dtype] = None,
    fail_missing: Optional[bool] = None,
) -> numpy.ndarray:
    """Find the position in ``targets`` that matches each element of ``x``.

    Args:
        x:
            Sequence of values to match.

        targets:
            Sequence of targets to be matched against. Alternatively, a
            dictionary generated by passing a sequence of targets to
            :py:meth:`~biocutils.map_to_index.map_to_index`.

        duplicate_method:
            How to handle duplicate entries in ``targets``. Matches can
            be reported to the first or last occurrence of duplicates.
            Only used when ``targets`` is not already a dictionary.

        dtype:
            NumPy type of the output array. This should be an integer type; if
            missing values are expected, the type should be a signed integer.
            If None, the smallest signed integer type that can hold both -1
            and the largest index stored for ``targets`` is chosen.

        fail_missing:
            Whether to raise an error if an entry of ``x`` cannot be found in
            ``targets``. If ``None``, this defaults to ``True`` if ``dtype``
            is an unsigned type, otherwise it defaults to ``False``.

    Returns:
        Array of length equal to ``x``, containing the integer position of each
        entry of ``x`` inside ``targets``; or -1, if the entry of ``x`` has no
        match in ``targets`` and ``fail_missing`` is false.

    Raises:
        ValueError: If ``fail_missing`` is true (explicitly or by default) and
            an entry of ``x`` is absent from ``targets``.
    """
    if isinstance(targets, dict):
        index_of = targets
    else:
        index_of = map_to_index(targets, duplicate_method=duplicate_method)

    if dtype is None:
        # Size the type from the largest stored index rather than the number
        # of keys: duplicated targets collapse to a single key, so the key
        # count can be much smaller than the positions being reported.
        # Negating (largest + 1) forces a signed type that also covers the
        # empty case, where min_scalar_type(0) would otherwise be unsigned.
        largest = max(index_of.values(), default=0)
        dtype = numpy.min_scalar_type(-(int(largest) + 1))

    indices = numpy.zeros(len(x), dtype=dtype)

    if fail_missing is None:
        # An unsigned output cannot hold the -1 placeholder, so a missing
        # entry is treated as an error unless the caller says otherwise.
        fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)

    # Separate loops to reduce branching in the tight inner loop.
    if not fail_missing:
        for i, y in enumerate(x):
            indices[i] = index_of.get(y, -1)
    else:
        for i, y in enumerate(x):
            if y not in index_of:
                raise ValueError("cannot find '" + str(y) + "' in 'targets'")
            indices[i] = index_of[y]

    return indices