Source code for biocutils.split

from functools import singledispatch
from typing import Any, Sequence, Union

import numpy

from .Factor import Factor
from .get_height import get_height
from .match import match
from .NamedList import NamedList
from .subset import subset



[docs]
@singledispatch
def split(
    x: Any,
    f: Sequence,
    skip: Union[set, Sequence] = [None, numpy.ma.masked],
    drop: bool = False,
    as_NamedList: bool = False,
) -> Union[dict, NamedList]:
    """
    Partition the entries of ``x`` according to a grouping factor ``f``.

    Args:
        x:
            Object whose entries are to be distributed across groups.
            Anything accepted by :py:func:`~biocutils.subset.subset` is valid.

        f:
            Categorical values assigning each entry of ``x`` to a group.
            Its height must equal the height of ``x``, as reported by
            :py:func:`~biocutils.get_height.get_height`.

            For a plain sequence, groups are ordered by sorting the unique values of ``f``.
            For a :py:class:`~biocutils.Factor.Factor`, groups follow the order of its levels,
            and entries with a negative code are not assigned to any group.

        skip:
            Values of ``f`` that should not form a group.
            Entries of ``x`` associated with these values are left out of the output.
            An empty collection disables skipping.

        drop:
            If ``f`` is a ``Factor``, whether its unused levels are dropped before splitting.
            Ignored for any other type of ``f``.

        as_NamedList:
            Whether to return a :py:class:`~biocutils.NamedList.NamedList` rather than a dictionary.
            This automatically converts all groups into strings.

    Returns:
        A dictionary mapping each group to the subset of ``x`` belonging to that group.
        If ``as_NamedList = True``, a ``NamedList`` named by the groups is returned instead.

    Raises:
        ValueError: If the heights of ``x`` and ``f`` are not the same.

    Examples:
        >>> import numpy
        >>> import biocutils
        >>> x = numpy.random.rand(10)
        >>> f = numpy.random.choice(["A", "B", "C"], 10)
        >>> biocutils.split(x, f)
        >>> biocutils.split(x, f, as_NamedList=True)
        >>> lev = ["X", "A", "Y", "B", "Z", "C"]
        >>> fac = biocutils.Factor.from_sequence(f, lev)
        >>> biocutils.split(x, fac, drop=False)
    """

    if isinstance(f, Factor):
        if drop:
            f = f.drop_unused_levels()
        if len(skip) > 0:
            # Build the surviving levels together with a lookup table that
            # sends each old code to its new position, or -1 if skipped.
            levels = []
            remap = []
            for level in f.get_levels():
                if level in skip:
                    remap.append(-1)
                else:
                    remap.append(len(levels))
                    levels.append(level)
            # Negative codes are passed through untouched.
            indices = [remap[code] if code >= 0 else code for code in f.get_codes()]
        else:
            levels = f.get_levels()
            indices = f.get_codes()
    else:
        if len(skip) > 0:
            unique = {value for value in f if value not in skip}
        else:
            unique = set(f)
        levels = sorted(unique)
        indices = match(f, levels)

    if get_height(x) != get_height(f):
        raise ValueError("heights of 'x' and 'f' should be the same")

    # Gather the positions of 'x' that fall into each group; a negative group
    # index means that the entry is not assigned anywhere.
    members = [[] for _ in levels]
    for position, group in enumerate(indices):
        if group >= 0:
            members[group].append(position)
    groups = [subset(x, positions) for positions in members]

    if as_NamedList:
        return NamedList(groups, levels)
    return dict(zip(levels, groups))