Source code for biocutils.factorize
from typing import Optional, Sequence, Tuple
import numpy
from .is_missing_scalar import is_missing_scalar
from .match import match
[docs]
def factorize(
x: Sequence,
levels: Optional[Sequence] = None,
sort_levels: bool = False,
dtype: Optional[numpy.dtype] = None,
fail_missing: Optional[bool] = None,
) -> Tuple[list, numpy.ndarray]:
"""Convert a sequence of hashable values into a factor.
Args:
x:
A sequence of hashable values.
Any value may be None to indicate missingness.
levels:
Sequence of reference levels, against which the entries in ``x`` are compared.
If None, this defaults to all unique values of ``x``.
sort_levels:
Whether to sort the automatically-determined levels.
If False, the levels are kept in order of their appearance in ``x``.
Not used if ``levels`` is explicitly supplied.
dtype:
NumPy type of the array of indices, see
:py:func:`~biocutils.match.match` for details.
fail_missing:
Whether to raise an error upon encountering missing levels in
``x``, see :py:func:`~biocutils.match.match` for details.
Returns:
Tuple where the first element is a list of unique levels and the second
element in a NumPy array containing integer codes, i.e., indices into
the first list. Indexing the first list by the second array will
recover ``x``, with the exception of any None or masked values in ``x``
that will instead be represented by -1 in the second array.
"""
if levels is None:
present = set()
levels = []
for val in x:
if not is_missing_scalar(val) and val not in present:
levels.append(val)
present.add(val)
if sort_levels:
levels.sort()
codes = match(x, levels, dtype=dtype, fail_missing=fail_missing)
return levels, codes