# Source code for biocutils.Factor

from copy import copy, deepcopy
from typing import Optional, Sequence, Union

import numpy
import warnings

from .assign_sequence import assign_sequence
from .combine_sequences import combine_sequences
from .factorize import factorize
from .is_list_of_type import is_list_of_type
from .is_missing_scalar import is_missing_scalar
from .match import match
from .Names import Names, _combine_names, _name_to_position, _sanitize_names
from .normalize_subscript import (
    NormalizedSubscript,
    SubscriptTypes,
    normalize_subscript,
)
from .print_truncated import print_truncated_list
from .StringList import StringList
from .subset_sequence import subset_sequence


def _sanitize_codes(codes: Sequence[int], num_levels: int) -> numpy.ndarray:
    """Coerce ``codes`` into a validated 1-dimensional signed integer array.

    Args:
        codes:
            Sequence of integer codes. For sequences that are not NumPy
            arrays, negative entries and missing scalars (as reported by
            ``is_missing_scalar``) are replaced with -1. NumPy arrays must be
            1-dimensional; arrays that already have a signed integer dtype
            are returned as-is after validation, all others are cast to a
            signed integer dtype.

        num_levels:
            Number of levels. Every non-missing code must be less than this.

    Returns:
        A NumPy array of signed integer codes, each of which is at least -1
        and less than ``num_levels``.

    Raises:
        ValueError: If ``codes`` is a NumPy array that is not 1-dimensional,
            or if any entry does not refer to a level.
    """
    out_of_range = "all entries of 'codes' should refer to an entry of 'levels'"

    # min_scalar_type(0) is unsigned, so always ask for enough room to hold
    # -1; otherwise the missing code is not representable without levels.
    signed_type = numpy.min_scalar_type(-max(num_levels, 1))

    if not isinstance(codes, numpy.ndarray):
        replacement = numpy.empty(len(codes), dtype=signed_type)
        for i, x in enumerate(codes):
            if is_missing_scalar(x) or x < 0:
                replacement[i] = -1
            elif x >= num_levels:
                # Reject before storing: an oversized value might not fit
                # into the narrow dtype chosen above.
                raise ValueError(out_of_range)
            else:
                replacement[i] = x
        return replacement

    if len(codes.shape) != 1:
        raise ValueError("'codes' should be a 1-dimensional array")

    # Validate on the incoming dtype so that the narrowing cast below cannot
    # wrap an invalid code into the valid range.
    if ((codes < -1) | (codes >= num_levels)).any():
        raise ValueError(out_of_range)

    if not numpy.issubdtype(codes.dtype, numpy.signedinteger):
        codes = codes.astype(signed_type)  # force it to be signed.
    return codes


def _sanitize_levels(levels: Sequence[str], check: bool = True) -> StringList:
    """Convert ``levels`` into an unnamed ``StringList`` and optionally validate it.

    Args:
        levels:
            Sequence of levels. Anything that is not already a ``StringList``
            is passed to the ``StringList`` constructor.

        check:
            Whether to verify that the levels are non-missing and unique.

    Returns:
        A ``StringList`` of levels without names.

    Raises:
        TypeError: If ``check`` is set and any level is None.
        ValueError: If ``check`` is set and any level is duplicated.
    """
    sanitized = levels if isinstance(levels, StringList) else StringList(levels)

    # Names on the levels are discarded.
    if sanitized.get_names() is not None:
        sanitized = sanitized.set_names(None)

    if not check:
        return sanitized

    for entry in sanitized:
        if entry is None:
            raise TypeError("all entries of 'levels' should be non-missing")

    # A set can never be longer than its source, so any mismatch in length
    # means that at least one level was duplicated.
    if len(set(sanitized)) != len(sanitized):
        raise ValueError("all entries of 'levels' should be unique")

    return sanitized



[docs]
class FactorIterator:
    """Iterator for a :py:class:`~Factor` object.

    Each step reports the value that the parent's ``get_value`` method
    returns for the next position.
    """

    def __init__(self, parent: "Factor"):
        """
        Args:
            parent: The parent :py:class:`~Factor` object.
        """
        self._parent = parent
        self._position = 0

    def __iter__(self) -> "FactorIterator":
        """
        Returns:
            The iterator.
        """
        return self

    def __next__(self) -> Union[str, None]:
        """
        Returns:
            Level corresponding to the code at the current position, or None
            for missing codes.

        Raises:
            StopIteration: Once the position reaches the length of the parent.
        """
        if self._position >= len(self._parent):
            raise StopIteration
        # Only advance once the lookup has succeeded.
        val = self._parent.get_value(self._position)
        self._position += 1
        return val





[docs]
class Factor:
    """Factor class, equivalent to R's ``factor``.

    This is a vector of integer codes, each of which is an index into a list of
    unique strings. The aim is to encode a list of strings as integers for
    easier numerical analysis.
    """

    def __init__(
        self,
        codes: Sequence[int],
        levels: Sequence[str],
        ordered: bool = False,
        names: Optional[Names] = None,
        _validate: bool = True,
    ):
        """Initialize a Factor object.

        Args:
            codes:
                Sequence of codes. Each valid code should be a non-negative
                integer that refers to an entry ``levels``. Codes may be
                negative or correspond to a missing scalar (as defined by
                :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`),
                in which case they are assumed to represent missing values.

            levels:
                List of levels containing unique strings.

            ordered:
                Whether the levels are ordered.

            names:
                List of names. This should have same length as ``codes``.
                Alternatively None, if the factor has no names yet.

            _validate:
                Internal use only.
        """
        if _validate:
            levels = _sanitize_levels(levels)
            codes = _sanitize_codes(codes, len(levels))
            names = _sanitize_names(names, len(codes))

        self._codes = codes
        self._levels = levels
        self._ordered = bool(ordered)
        self._names = names

    ##################################
    #####>>>> Simple getters <<<<#####
    ##################################

    def _define_output(self, in_place: bool) -> "Factor":
        if in_place:
            return self
        else:
            return copy(self)


[docs]
    def get_codes(self) -> numpy.ndarray:
        """
        Returns:
            Array of integer codes, used as indices into the levels from
            :py:meth:`~get_levels`. Missing values are marked with -1.

            This should be treated as a read-only reference. To modify
            the codes, use :py:meth:`~set_codes` instead.
        """
        return self._codes


    @property
    def codes(self) -> numpy.ndarray:
        """Alias for :py:meth:`~get_codes`."""
        return self.get_codes()


[docs]
    def set_codes(self, codes: Sequence[int], in_place: bool = False) -> "Factor":
        """Replace the codes of this factor.

        Args:
            codes:
                Integer codes referencing the factor levels, of the same
                length as the current object.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A ``Factor`` with the new codes. This is the current object if
            ``in_place = True``, otherwise it is a new object.

        Raises:
            ValueError: If the length of ``codes`` differs from the length
                of the current object.
        """
        target = self._define_output(in_place)
        if len(codes) == len(self):
            # Codes are sanitized against the existing number of levels.
            target._codes = _sanitize_codes(codes, len(self._levels))
            return target
        raise ValueError(
            "length of 'codes' should be equal to that of the current object"
        )



[docs]
    def get_levels(self) -> StringList:
        """
        Returns:
            List of strings containing the factor levels.

            This should be treated as a read-only reference. To modify the
            levels, use :py:meth:`~replace_levels` instead.
        """
        return self._levels


    @property
    def levels(self) -> StringList:
        """Alias for :py:meth:`~get_levels`."""
        return self.get_levels()


[docs]
    def get_ordered(self) -> bool:
        """
        Returns:
            True if the levels are ordered, otherwise False.
        """
        return self._ordered


    @property
    def ordered(self) -> bool:
        """Alias for :py:meth:`~get_ordered`."""
        return self.get_ordered()


[docs]
    def set_ordered(self, ordered: bool, in_place: bool = False) -> "Factor":
        """
        Args:
            ordered:
                Whether to treat the levels as being ordered.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A modified ``Factor`` object with the new ordered status, either as
            a new object or as a reference to the current object.
        """
        output = self._define_output(in_place)
        output._ordered = bool(ordered)
        return output



[docs]
    def get_names(self) -> Names:
        """
        Returns:
            Names for the factor elements.

            This should be treated as a read-only reference. To modify the
            names, use :py:meth:`~set_names` instead.
        """
        return self._names


    @property
    def names(self) -> Names:
        """Alias for :py:meth:`~get_names`."""
        return self.get_names()


[docs]
    def set_names(self, names: Optional[Names], in_place: bool = False) -> "Factor":
        """Replace the names of the factor elements.

        Args:
            names:
                List of names, of the same length as this object.
                Alternatively None, to remove the names.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            A modified ``Factor`` with the new names, either as a new object or
            as a reference to the current object.
        """
        output = self._define_output(in_place)
        # The names are sanitized against the length of this object.
        output._names = _sanitize_names(names, len(self))
        return output


    #################################
    #####>>>> Miscellaneous <<<<#####
    #################################


[docs]
    def __len__(self) -> int:
        """
        Returns:
            Length of the factor in terms of the number of codes.
        """
        return len(self._codes)



[docs]
    def __iter__(self) -> FactorIterator:
        """Iterate over the elements of the factor.

        Returns:
            A new :py:class:`~FactorIterator` over this object, which walks
            the codes and reports the corresponding level (or None).
        """
        return FactorIterator(self)



[docs]
    def __repr__(self) -> str:
        """
        Returns:
            A stringified representation of this object. The ``ordered``
            field is only reported for ordered factors, and the ``names``
            field is only reported when names are present.
        """
        fields = [
            "codes=" + print_truncated_list(self._codes),
            "levels=" + print_truncated_list(self._levels),
        ]
        if self._ordered:
            fields.append("ordered=True")
        # Compare against None, as done in __str__, so that names of an
        # empty factor are not silently omitted.
        if self._names is not None:
            fields.append("names=" + print_truncated_list(self._names))
        return "Factor(" + ", ".join(fields) + ")"


    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed representation of this object, with one line
            each for the summary, the values, the names (if any), the levels
            and the ordered status.
        """
        num_levels = len(self._levels)
        summary = (
            "Factor of length " + str(len(self._codes)) + " with " + str(num_levels) + " level"
        )
        # Only a single level is singular; zero levels is still plural.
        if num_levels != 1:
            summary += "s"

        def show_value(code):
            # A negative code is a missing value; indexing the levels with it
            # would wrongly report the last level instead.
            # NOTE(review): a string is returned on the assumption that
            # print_truncated_list() expects strings from 'transform', as the
            # identity transform is applied to the names and levels below.
            if code < 0:
                return "None"
            return self._levels[code]

        lines = [
            summary,
            "values: "
            + print_truncated_list(
                self._codes, transform=show_value, include_brackets=False
            ),
        ]
        if self._names is not None:
            lines.append(
                "names: "
                + print_truncated_list(
                    self._names, transform=lambda x: x, include_brackets=False
                )
            )
        lines.append(
            "levels: "
            + print_truncated_list(
                self._levels, transform=lambda x: x, include_brackets=False
            )
        )
        lines.append("ordered: " + str(self._ordered))
        return "\n".join(lines)


[docs]
    def __eq__(self, other: "Factor"):
        """
        Args:
            other: Another ``Factor``.

        Returns:
            Whether the current object is equal to ``other``, i.e.,
            same codes, levels, names and ordered status.
        """
        if not isinstance(other, Factor):
            return False
        if len(self) != len(other) or self._levels != other._levels or self._names != other._names or self._ordered != other._ordered:
            return False
        return (self._codes == other._codes).all()


    ###########################
    #####>>>> Slicing <<<<#####
    ###########################


[docs]
    def get_value(self, index: Union[str, int]) -> Union[str, None]:
        """
        Args:
            index:
                Integer index of the element to obtain. Alternatively, a string
                containing the name of the element, using the first occurrence
                if duplicate names are present.

        Returns:
            The factor level for the code at the specified position, or None if
            the entry is missing.
        """
        if isinstance(index, str):
            index = _name_to_position(self._names, index)
        i = self._codes[index]
        if i < 0:
            return None
        return self._levels[i]



[docs]
    def get_slice(self, index: SubscriptTypes) -> "Factor":
        """Get a subset of the factor.

        Args:
            index:
                Subset of elements to obtain, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present.  Scalars are treated as length-1 sequences.

        Returns:
            A new ``Factor`` containing the specified subset of codes and, if
            present, the matching subset of names. Levels and ordered status
            are carried over from the current object.
        """
        positions, _ = normalize_subscript(index, len(self), self._names)
        subset = copy(self)
        subset._codes = self._codes[positions]
        if subset._names is not None:
            subset._names = subset_sequence(self._names, positions)
        return subset



[docs]
    def __getitem__(self, index: SubscriptTypes) -> Union[str, "Factor"]:
        """
        For a scalar ``index``, this is an alias for :py:meth:`~get_value`.

        For a sequence ``index``, this is an alias for :py:meth:`~get_slice`.
        """
        positions, is_scalar = normalize_subscript(index, len(self), self._names)
        if not is_scalar:
            # Pass the already-normalized subscript on to get_slice().
            return self.get_slice(NormalizedSubscript(positions))
        return self.get_value(positions[0])



[docs]
    def set_value(
        self, index: Union[str, int], value: Union[str, None], in_place: bool = False
    ) -> "Factor":
        """
        Args:
            index:
                Integer index of the element to replace. Alternatively, a string
                containing the name of the element, using the first occurrence
                if duplicate names are present.

            value:
                Replacement value. This should be a string corresponding to a
                factor level, or None if missing.

            in_place:
                Whether to perform the modification in place.

        Returns:
            A ``Factor`` object with the modified entry at ``index``. This is either
            a new object or a reference to the current object.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        if isinstance(index, str):
            index = _name_to_position(self._names, index)

        if value is None:
            output._codes[index] = -1
            return output

        for i, l in enumerate(output._levels):
            if l == value:
                output._codes[index] = i
                return output

        raise IndexError("failed to find level '" + str(value) + "'")



[docs]
    def set_slice(self, index: SubscriptTypes, value: "Factor", in_place: bool = False):
        """
        Replace items in the ``Factor`` list. Each element of the current
        object selected by ``index`` is overwritten by the corresponding
        element of ``value``. If both objects have equal levels, the codes of
        ``value`` are copied over directly. Otherwise, the levels of ``value``
        are matched against the levels of the current object and each
        replacement code is translated through that match; missing codes in
        ``value`` are inserted as -1.

        Args:
            index:
                Subset of elements to replace, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present. Scalars are treated as length-1 sequences.

            value:
                A ``Factor`` of the same length containing the replacement values.

            in_place:
                Whether the replacement should be performed in place.

        Returns:
            A ``Factor`` object with values at ``index`` replaced by ``value``.
            This is either a new object or a reference to the current object,
            depending on ``in_place``.
        """
        target = self if in_place else copy(self)
        if not in_place:
            # Give the copy its own codes so that the original is untouched.
            target._codes = copy(self._codes)
        destination = target._codes

        positions, _ = normalize_subscript(index, len(self), self._names)

        if self._levels == value._levels:
            # Identical levels, so the codes are directly interchangeable.
            for offset, pos in enumerate(positions):
                destination[pos] = value._codes[offset]
            return target

        translation = match(value._levels, self._levels)
        for offset, pos in enumerate(positions):
            incoming = value._codes[offset]
            destination[pos] = translation[incoming] if incoming >= 0 else -1
        return target



[docs]
    def __setitem__(self, index: SubscriptTypes, value: Union[str, "Factor"]):
        """
        If ``index`` is a scalar, this is an alias for :py:meth:`~set_value`.

        If ``index`` is a sequence, this is an alias for :py:meth:`~set_slice`.

        In both cases, the current object is modified in place.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        if scalar:
            # Unwrap the single normalized position, as done in __getitem__,
            # so that set_value() receives the integer index it documents.
            self.set_value(index[0], value, in_place=True)
        else:
            self.set_slice(NormalizedSubscript(index), value, in_place=True)


    #################################
    #####>>>> Level setting <<<<#####
    #################################


[docs]
    def drop_unused_levels(self, in_place: bool = False) -> "Factor":
        """Remove all levels that are not referenced by any code.

        Args:
            in_place: Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new ``Factor`` object)
            where all unused levels have been removed.

            If ``in_place = True``, unused levels are removed from the
            current object; a reference to the current object is returned.
        """
        target = self if in_place else copy(self)
        if not in_place:
            # Give the copy its own codes so that the original is untouched.
            target._codes = copy(self._codes)

        # Flag every level that is referenced by at least one code.
        used = [False] * len(self._levels)
        for code in self._codes:
            if code >= 0:
                used[code] = True

        # Keep the flagged levels in their existing order, recording the
        # new position of each one (or -1 if it was dropped).
        kept = StringList([])
        translation = []
        for position, flag in enumerate(used):
            if flag:
                translation.append(len(kept))
                kept.append(self._levels[position])
            else:
                translation.append(-1)

        # Missing codes are left as they are.
        updated = target._codes
        for position, code in enumerate(self._codes):
            if code >= 0:
                updated[position] = translation[code]

        target._levels = kept
        return target



[docs]
    def replace_levels(
        self,
        levels: Sequence[str],
        in_place: bool = False,
    ) -> "Factor":
        """Swap the existing levels for a new list without touching the codes.
        The codes of the returned ``Factor`` index into the replacement
        ``levels``, so each element of the ``Factor`` may refer to a different
        string afterwards. (To change the levels while ensuring that each
        element of the ``Factor`` refers to the same string, use
        :py:meth:`~remap_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values. The length of this sequence
                should be no less than the current number of levels.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. Codes
            are unchanged and may refer to different strings.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If ``levels`` is shorter than the existing levels, or
                contains missing or duplicated entries.
        """
        replacement = levels if isinstance(levels, StringList) else StringList(levels)
        if len(replacement) < len(self._levels):
            raise ValueError("'levels' should be at least as long as the existing levels")

        seen = set()
        for entry in replacement:
            if entry is None:
                raise ValueError("all entries of 'levels' should be non-missing")
            if entry in seen:
                raise ValueError("all entries of 'levels' should be unique")
            seen.add(entry)

        # The codes are shared with the original when a copy is made, which
        # is fine as they are not modified here.
        target = self._define_output(in_place)
        target._levels = replacement
        return target



[docs]
    def set_levels(
        self,
        levels: Union[str, Sequence[str]],
        remap: bool = True,
        in_place: bool = False
    ) -> "Factor":
        """
        Alias for :py:meth:`~remap_levels` if ``remap = True``, otherwise an
        alias for :py:meth:`~replace_levels`. The first alias is deprecated and
        :py:meth:`~remap_levels` should be used directly if that is the intent.

        Args:
            levels:
                Replacement levels, passed on to the chosen method.

            remap:
                Whether to remap the codes to the new levels.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            The ``Factor`` returned by the chosen method.
        """
        if not remap:
            return self.replace_levels(levels, in_place=in_place)

        # stacklevel=2 attributes the warning to the caller of set_levels(),
        # so that it points at the code that needs to be updated.
        warnings.warn(
            "'remap=True' is deprecated, use 'remap_levels()' instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return self.remap_levels(levels, in_place=in_place)



[docs]
    def remap_levels(
        self, levels: Union[str, Sequence[str]], in_place: bool = False
    ) -> "Factor":
        """Switch to a replacement list of levels and translate the codes to
        match. Each entry of the remapped ``Factor`` refers to the same string
        before and after, provided that string is present in both sets of
        levels. (To change the levels without altering the codes of the
        ``Factor``, use :py:meth:`~replace_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values.

                Alternatively a single string containing an existing level in
                this object. The new levels are defined as a permutation of the
                existing levels where the provided string is now the first
                level. The order of all other levels is preserved.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. This will
            automatically update the codes so that they still refer to the same
            string in the new ``levels``. If a code refers to a level that is
            not present in the new ``levels``, it is set to a missing value.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If ``levels`` is a string that is not an existing
                level, or a sequence with missing or duplicated entries.
        """
        target = self._define_output(in_place)
        if not in_place:
            # Give the copy its own codes so that the original is untouched.
            target._codes = copy(self._codes)

        # Position of each level string within the replacement levels.
        positions = {}
        if isinstance(levels, str):
            # Move the requested level to the front, keeping the others in
            # their existing order.
            replacement = StringList([levels])
            for level in self._levels:
                if level != levels:
                    positions[level] = len(replacement)
                    replacement.append(level)
                else:
                    positions[level] = 0
            if levels not in positions:
                raise ValueError(
                    "string 'levels' should already be present among object levels"
                )
        else:
            replacement = levels if isinstance(levels, StringList) else StringList(levels)
            for position, level in enumerate(replacement):
                if level is None:
                    raise ValueError("all entries of 'levels' should be non-missing")
                if level in positions:
                    raise ValueError("all entries of 'levels' should be unique")
                positions[level] = position

        # New code for each old code, or -1 if the old level was dropped.
        translation = [positions.get(level, -1) for level in self._levels]

        updated = target._codes
        for position, code in enumerate(updated):
            updated[position] = translation[code] if code >= 0 else -1

        target._levels = replacement
        return target


    ###########################
    #####>>>> Copying <<<<#####
    ###########################


[docs]
    def __copy__(self) -> "Factor":
        """
        Returns:
            A shallow copy of the ``Factor`` object.
        """
        return type(self)(
            self._codes,
            levels=self._levels,
            ordered=self._ordered,
            names=self._names,
            _validate=False,
        )



[docs]
    def __deepcopy__(self, memo) -> "Factor":
        """
        Returns:
            A deep copy of the ``Factor`` object.
        """
        return type(self)(
            deepcopy(self._codes, memo),
            levels=deepcopy(self._levels, memo),
            ordered=self._ordered,
            names=deepcopy(self._names, memo),
            _validate=False,
        )


    #############################
    #####>>>> Coercions <<<<#####
    #############################


[docs]
    def to_pandas(self):
        """Coerce to :py:class:`~pandas.Categorical` object.

        Returns:
            Categorical: A :py:class:`~pandas.Categorical` object.
        """
        from pandas import Categorical

        return Categorical(
            values=[self._levels[c] for c in self._codes],
            ordered=self._ordered,
        )



[docs]
    @staticmethod
    def from_sequence(
        x: Sequence[str],
        levels: Optional[Sequence[str]] = None,
        sort_levels: bool = True,
        ordered: bool = False,
        names: Optional[Sequence[str]] = None,
        **kwargs
    ) -> "Factor":
        """Build a factor from a sequence of values via
        :py:func:`~biocutils.factorize.factorize`.

        Args:
            x:
                A sequence of strings. Any value may be None to indicate
                missingness.

            levels:
                Sequence of reference levels, against which the entries in ``x`` are compared.
                If None, this defaults to all unique values of ``x``.

            sort_levels:
                Whether to sort the automatically-determined levels. If False,
                the levels are kept in order of their appearance in ``x``.  Not
                used if ``levels`` is explicitly supplied.

            ordered:
                Whether the levels should be assumed to be ordered.  Note that
                this refers to their importance and has nothing to do with
                their sorting order or with the setting of ``sort_levels``.

            names:
                List of names. This should have same length as ``x``.
                Alternatively None, if the factor has no names.

            kwargs:
                Further arguments to pass to
                :py:func:`~biocutils.factorize.factorize`.

        Returns:
            A ``Factor`` object.
        """
        found_levels, found_codes = factorize(
            x, levels=levels, sort_levels=sort_levels, **kwargs
        )
        return Factor(found_codes, levels=found_levels, ordered=ordered, names=names)




@subset_sequence.register
def _subset_sequence_Factor(x: Factor, indices: Sequence[int]) -> Factor:
    """``subset_sequence`` implementation for ``Factor``, via ``Factor.get_slice``."""
    subscript = NormalizedSubscript(indices)
    return x.get_slice(subscript)


@assign_sequence.register
def _assign_sequence_Factor(x: Factor, indices: Sequence[int], other: Factor) -> Factor:
    """``assign_sequence`` implementation for ``Factor``, via ``Factor.set_slice``.

    ``set_slice`` is called with its default ``in_place=False``, so ``x`` is
    left unmodified and a new object is returned.
    """
    subscript = NormalizedSubscript(indices)
    return x.set_slice(subscript, other)


@combine_sequences.register(Factor)
def _combine_factors(*x: Factor):
    """``combine_sequences`` implementation for ``Factor`` objects.

    If all factors have the same levels and ordered status, their codes are
    combined directly and the levels and ordered status of the first factor
    are reused. Otherwise, the levels are merged in order of first appearance
    across the factors, the codes are translated to the merged levels, and
    the result is unordered.

    Args:
        x: One or more ``Factor`` objects to combine.

    Returns:
        An object of the same type as the first factor, containing the
        combined codes and names.

    Raises:
        ValueError: If any element of ``x`` is not a ``Factor``.
    """
    if not is_list_of_type(x, Factor):
        raise ValueError("all elements to `combine` must be `Factor` objects")

    first = x[0]
    first_levels = first._levels
    all_same = not any(
        f._levels != first_levels or f._ordered != first._ordered for f in x[1:]
    )

    if all_same:
        new_codes = [f._codes for f in x]
        new_levels = first._levels
        new_ordered = first._ordered
    else:
        new_codes = []
        all_levels_map = {}
        new_levels = StringList()
        for f in x:
            # Position of each level of 'f' within the merged levels.
            mapping = []
            for y in f._levels:
                if y not in all_levels_map:
                    all_levels_map[y] = len(new_levels)
                    new_levels.append(y)
                mapping.append(all_levels_map[y])

            # min_scalar_type(0) is unsigned, so always ask for enough room
            # to hold -1 in case there are missing codes but no levels yet.
            curout = numpy.empty(
                len(f), dtype=numpy.min_scalar_type(-max(len(new_levels), 1))
            )
            for i, j in enumerate(f._codes):
                # Missing codes are carried over as they are.
                curout[i] = j if j < 0 else mapping[j]
            new_codes.append(curout)
        new_ordered = False

    # Validation is skipped as the codes were derived from existing factors.
    return type(x[0])(
        codes=combine_sequences(*new_codes),
        levels=new_levels,
        ordered=new_ordered,
        names=_combine_names(*x, get_names=lambda x: x.get_names()),
        _validate=False,
    )