Source code for biocutils.split

from functools import singledispatch
from typing import Any, Sequence, Union

import numpy

from .Factor import Factor
from .get_height import get_height
from .match import match
from .NamedList import NamedList
from .subset import subset


@singledispatch
def split(
    x: Any,
    f: Sequence,
    skip: Union[set, Sequence] = [None, numpy.ma.masked],
    drop: bool = False,
    as_NamedList: bool = False,
) -> Union[dict, NamedList]:
    """
    Split a sequence ``x`` into groups defined by a categorical factor ``f``.

    Args:
        x:
            Values to be divided into groups.
            Any object that supports :py:func:`~biocutils.subset.subset` can be used here.

        f:
            A sequence of categorical variables defining the groupings.
            This should have length equal to the "height" of ``x`` (see :py:func:`~biocutils.get_height.get_height`).

            The order of groups is defined by sorting all unique variables in ``f``.
            If a :py:class:`~biocutils.Factor.Factor` is provided, the order of groups is defined by the existing levels.

        skip:
            Values of ``f`` to be skipped.
            The corresponding entries of ``x`` are also omitted from the output.
            This argument is only read (``len`` and membership tests), never modified.

        drop:
            Whether to drop unused levels, if ``f`` is a ``Factor``.

        as_NamedList:
            Whether to return the results as a :py:class:`~biocutils.NamedList.NamedList`.
            This automatically converts all groups into strings.

    Returns:
        A dictionary where each key is a unique group and each value contains that group's entries from ``x``.
        If ``as_NamedList = True``, this is a ``NamedList`` instead.

    Raises:
        ValueError: If the heights of ``x`` and ``f`` are not the same.

    Examples:
        >>> import numpy
        >>> x = numpy.random.rand(10)
        >>> f = numpy.random.choice(["A", "B", "C"], 10)
        >>> import biocutils
        >>> biocutils.split(x, f)
        >>> biocutils.split(x, f, as_NamedList=True)
        >>> biocutils.split(
        ...     x,
        ...     biocutils.Factor.from_sequence(f, ["X", "A", "Y", "B", "Z", "C"]),
        ...     drop=False,
        ... )
    """
    is_factor = isinstance(f, Factor)
    if is_factor and drop:
        f = f.drop_unused_levels()

    # Validate before doing any grouping work, so that a length mismatch is
    # reported directly instead of after (or instead of) an unrelated failure
    # while computing the levels.
    if get_height(x) != get_height(f):
        raise ValueError("heights of 'x' and 'f' should be the same")

    # 'len(skip)' rather than truthiness: 'skip' may be any sized container.
    has_skip = len(skip) > 0

    if is_factor:
        if has_skip:
            # Build the list of retained levels, plus a lookup table mapping
            # each old level position to its new position (-1 if skipped).
            levels = []
            reindex = []
            for lev in f.get_levels():
                if lev in skip:
                    reindex.append(-1)
                else:
                    reindex.append(len(levels))
                    levels.append(lev)
            # Negative codes are passed through untouched; they are ignored
            # when assigning entries to groups below.
            indices = [
                reindex[code] if code >= 0 else code for code in f.get_codes()
            ]
        else:
            levels = f.get_levels()
            indices = f.get_codes()
    else:
        if has_skip:
            unique = {y for y in f if y not in skip}
        else:
            unique = set(f)
        levels = sorted(unique)
        indices = match(f, levels)

    # Gather, for each group, the positions in 'x' that belong to it.
    # Entries with a negative index are not assigned to any group.
    positions = [[] for _ in levels]
    for i, j in enumerate(indices):
        if j >= 0:
            positions[j].append(i)

    collected = [subset(x, members) for members in positions]

    if as_NamedList:
        return NamedList(collected, levels)
    return dict(zip(levels, collected))