# Source code for biocutils.Factor

from copy import copy, deepcopy
from typing import Optional, Sequence, Union

import numpy
import warnings

from .assign_sequence import assign_sequence
from .combine_sequences import combine_sequences
from .factorize import factorize
from .is_list_of_type import is_list_of_type
from .is_missing_scalar import is_missing_scalar
from .match import match
from .Names import Names, _combine_names, _name_to_position, _sanitize_names
from .normalize_subscript import (
    NormalizedSubscript,
    SubscriptTypes,
    normalize_subscript,
)
from .print_truncated import print_truncated_list
from .StringList import StringList
from .subset_sequence import subset_sequence


def _sanitize_codes(codes: Sequence[int], num_levels: int) -> numpy.ndarray:
    if not isinstance(codes, numpy.ndarray):
        replacement = numpy.ndarray(
            len(codes), dtype=numpy.min_scalar_type(-num_levels)
        )  # get a signed type.
        for i, x in enumerate(codes):
            if is_missing_scalar(x) or x < 0:
                replacement[i] = -1
            else:
                replacement[i] = x
        codes = replacement
    else:
        if len(codes.shape) != 1:
            raise ValueError("'codes' should be a 1-dimensional array")
        if not numpy.issubdtype(
            codes.dtype, numpy.signedinteger
        ):  # force it to be signed.
            codes = codes.astype(numpy.min_scalar_type(-num_levels))

    for x in codes:
        if x < -1 or x >= num_levels:
            raise ValueError(
                "all entries of 'codes' should refer to an entry of 'levels'"
            )

    return codes


def _sanitize_levels(levels: Sequence[str], check: bool = True) -> StringList:
    """Convert ``levels`` into an unnamed ``StringList``, optionally validating it.

    Args:
        levels:
            Sequence of levels. Anything that is not already a ``StringList``
            is passed to the ``StringList`` constructor.

        check:
            Whether to verify that the levels are non-missing and unique.

    Returns:
        A ``StringList`` of levels without names.

    Raises:
        TypeError: If ``check = True`` and any level is None.
        ValueError: If ``check = True`` and any level is duplicated.
    """
    sanitized = levels if isinstance(levels, StringList) else StringList(levels)

    # Names on the levels are not used by the Factor, so strip them.
    if sanitized.get_names() is not None:
        sanitized = sanitized.set_names(None)

    if not check:
        return sanitized

    for entry in sanitized:
        if entry is None:
            raise TypeError("all entries of 'levels' should be non-missing")

    if len(sanitized) > len(set(sanitized)):
        raise ValueError("all entries of 'levels' should be unique")

    return sanitized


class FactorIterator:
    """Iterator for a :py:class:`~Factor` object."""

    def __init__(self, parent: "Factor"):
        """
        Args:
            parent: The parent :py:class:`~Factor` object.
        """
        self._parent = parent
        self._position = 0

    def __iter__(self) -> "FactorIterator":
        """
        Returns:
            The iterator itself.
        """
        return self

    def __next__(self) -> Union[str, None]:
        """
        Returns:
            Level corresponding to the code at the current position, or None
            for missing codes.

        Raises:
            StopIteration: Once every position of the parent has been visited.
        """
        current = self._position
        if current >= len(self._parent):
            raise StopIteration

        # Only advance once the lookup has succeeded.
        level = self._parent.get_value(current)
        self._position = current + 1
        return level
class Factor:
    """Factor class, equivalent to R's ``factor``.

    This is a vector of integer codes, each of which is an index into a list
    of unique strings. The aim is to encode a list of strings as integers for
    easier numerical analysis.
    """

    def __init__(
        self,
        codes: Sequence[int],
        levels: Sequence[str],
        ordered: bool = False,
        names: Optional["Names"] = None,
        _validate: bool = True,
    ):
        """Initialize a Factor object.

        Args:
            codes:
                Sequence of codes. Each valid code should be a non-negative
                integer that refers to an entry ``levels``. Codes may be
                negative or correspond to a missing scalar (as defined by
                :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`),
                in which case they are assumed to represent missing values.

            levels:
                List of levels containing unique strings.

            ordered:
                Whether the levels are ordered.

            names:
                List of names. This should have same length as ``codes``.
                Alternatively None, if the factor has no names yet.

            _validate:
                Internal use only.
        """
        if _validate:
            levels = _sanitize_levels(levels)
            codes = _sanitize_codes(codes, len(levels))
            names = _sanitize_names(names, len(codes))

        self._codes = codes
        self._levels = levels
        self._ordered = bool(ordered)
        self._names = names

    ##################################
    #####>>>> Simple getters <<<<#####
    ##################################

    def _define_output(self, in_place: bool) -> "Factor":
        """Return ``self`` if ``in_place``, otherwise a shallow copy."""
        if in_place:
            return self
        else:
            return copy(self)

    def get_codes(self) -> numpy.ndarray:
        """
        Returns:
            Array of integer codes, used as indices into the levels from
            :py:meth:`~get_levels`. Missing values are marked with -1.

            This should be treated as a read-only reference. To modify
            the codes, use :py:meth:`~set_codes` instead.
        """
        return self._codes

    @property
    def codes(self) -> numpy.ndarray:
        """Alias for :py:meth:`~get_codes`."""
        return self.get_codes()

    def set_codes(self, codes: Sequence[int], in_place: bool = False) -> "Factor":
        """
        Args:
            codes:
                Integer codes referencing the factor levels. This should
                have the same length as the current object.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A modified ``Factor`` object with the new codes, either as a
            new object or as a reference to the current object.

        Raises:
            ValueError: If ``codes`` has a different length to this object.
        """
        output = self._define_output(in_place)
        if len(codes) != len(self):
            raise ValueError(
                "length of 'codes' should be equal to that of the current object"
            )
        output._codes = _sanitize_codes(codes, len(self._levels))
        return output

    def get_levels(self) -> "StringList":
        """
        Returns:
            List of strings containing the factor levels.

            This should be treated as a read-only reference. To modify the
            levels, use :py:meth:`~replace_levels` instead.
        """
        return self._levels

    @property
    def levels(self) -> "StringList":
        """Alias for :py:meth:`~get_levels`."""
        return self.get_levels()

    def get_ordered(self) -> bool:
        """
        Returns:
            True if the levels are ordered, otherwise False.
        """
        return self._ordered

    @property
    def ordered(self) -> bool:
        """Alias for :py:meth:`~get_ordered`."""
        return self.get_ordered()

    def set_ordered(self, ordered: bool, in_place: bool = False) -> "Factor":
        """
        Args:
            ordered:
                Whether to treat the levels as being ordered.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A modified ``Factor`` object with the new ordered status, either
            as a new object or as a reference to the current object.
        """
        output = self._define_output(in_place)
        output._ordered = bool(ordered)
        return output

    def get_names(self) -> "Names":
        """
        Returns:
            Names for the factor elements.

            This should be treated as a read-only reference. To modify the
            names, use :py:meth:`~set_names` instead.
        """
        return self._names

    @property
    def names(self) -> "Names":
        """Alias for :py:meth:`~get_names`."""
        return self.get_names()

    def set_names(self, names: Optional["Names"], in_place: bool = False) -> "Factor":
        """
        Args:
            names:
                List of names, of the same length as this list.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            A modified ``Factor`` with the new names, either as a new
            object or as a reference to the current object.
        """
        output = self._define_output(in_place)
        output._names = _sanitize_names(names, len(self))
        return output

    #################################
    #####>>>> Miscellaneous <<<<#####
    #################################

    def __len__(self) -> int:
        """
        Returns:
            Length of the factor in terms of the number of codes.
        """
        return len(self._codes)

    def __iter__(self) -> "FactorIterator":
        """
        Returns:
            An iterator over the factor. This will iterate over the codes
            and report the corresponding level (or None).
        """
        return FactorIterator(self)

    def __repr__(self) -> str:
        """
        Returns:
            A stringified representation of this object.
        """
        tmp = (
            "Factor(codes="
            + print_truncated_list(self._codes)
            + ", levels="
            + print_truncated_list(self._levels)
        )
        if self._ordered:
            tmp += ", ordered=True"
        if self._names:
            tmp += ", names=" + print_truncated_list(self._names)
        tmp += ")"
        return tmp

    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed representation of this object.
        """
        levels = self._levels
        message = (
            "Factor of length "
            + str(len(self._codes))
            + " with "
            + str(len(levels))
            + " level"
        )
        if len(levels) != 1:  # pluralize for everything except exactly one.
            message += "s"
        message += "\n"

        def show_code(i):
            # Negative codes are missing; indexing with them would wrap
            # around and wrongly report the last level.
            return "None" if i < 0 else levels[i]

        message += (
            "values: "
            + print_truncated_list(
                self._codes, transform=show_code, include_brackets=False
            )
            + "\n"
        )
        if self._names is not None:
            message += (
                "names: "
                + print_truncated_list(
                    self._names, transform=lambda x: x, include_brackets=False
                )
                + "\n"
            )
        message += (
            "levels: "
            + print_truncated_list(
                levels, transform=lambda x: x, include_brackets=False
            )
            + "\n"
        )
        message += "ordered: " + str(self._ordered)
        return message

    def __eq__(self, other: "Factor") -> bool:
        """
        Args:
            other: Another ``Factor``.

        Returns:
            Whether the current object is equal to ``other``, i.e.,
            same codes, levels, names and ordered status.
        """
        if not isinstance(other, Factor):
            return False
        if (
            len(self) != len(other)
            or self._levels != other._levels
            or self._names != other._names
            or self._ordered != other._ordered
        ):
            return False
        # Convert from numpy.bool_ so that callers get a plain bool.
        return bool((self._codes == other._codes).all())

    ###########################
    #####>>>> Slicing <<<<#####
    ###########################

    def get_value(self, index: Union[str, int]) -> Union[str, None]:
        """
        Args:
            index:
                Integer index of the element to obtain. Alternatively, a
                string containing the name of the element, using the first
                occurrence if duplicate names are present.

        Returns:
            The factor level for the code at the specified position, or None
            if the entry is missing.
        """
        if isinstance(index, str):
            index = _name_to_position(self._names, index)
        i = self._codes[index]
        if i < 0:
            return None
        return self._levels[i]

    def get_slice(self, index: "SubscriptTypes") -> "Factor":
        """
        Args:
            index:
                Subset of elements to obtain, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present. Scalars are treated as length-1 sequences.

        Returns:
            A ``Factor`` is returned containing the specified subset.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        output = copy(self)
        output._codes = self._codes[index]
        if output._names is not None:
            output._names = subset_sequence(self._names, index)
        return output

    def __getitem__(self, index: "SubscriptTypes") -> Union[str, "Factor"]:
        """
        If ``index`` is a scalar, this is an alias for :py:meth:`~get_value`.

        If ``index`` is a sequence, this is an alias for :py:meth:`~get_slice`.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        if scalar:
            return self.get_value(index[0])
        else:
            return self.get_slice(NormalizedSubscript(index))

    def set_value(
        self, index: Union[str, int], value: Union[str, None], in_place: bool = False
    ) -> "Factor":
        """
        Args:
            index:
                Integer index of the element to replace. Alternatively, a
                string containing the name of the element, using the first
                occurrence if duplicate names are present.

            value:
                Replacement value. This should be a string corresponding to a
                factor level, or None if missing.

            in_place:
                Whether to perform the modification in place.

        Returns:
            A ``Factor`` object with the modified entry at ``index``. This is
            either a new object or a reference to the current object.

        Raises:
            IndexError: If ``value`` is not None and is not an existing level.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        if isinstance(index, str):
            index = _name_to_position(self._names, index)

        if value is None:
            output._codes[index] = -1
            return output

        for i, l in enumerate(output._levels):
            if l == value:
                output._codes[index] = i
                return output

        raise IndexError("failed to find level '" + str(value) + "'")

    def set_slice(
        self, index: "SubscriptTypes", value: "Factor", in_place: bool = False
    ) -> "Factor":
        """
        Replace items in the ``Factor`` list. The ``index`` elements in the
        current object are replaced with the corresponding values in ``value``.
        This is performed by finding the level for each entry of the
        replacement ``value``, matching it to a level in the current object,
        and replacing the entry of ``codes`` with the code of the matched
        level. If there is no matching level, a missing value is inserted.

        Args:
            index:
                Subset of elements to replace, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present. Scalars are treated as length-1 sequences.

            value:
                A ``Factor`` of the same length containing the replacement
                values.

            in_place:
                Whether the replacement should be performed in place.

        Returns:
            A ``Factor`` object with values at ``index`` replaced by ``value``.
            This is either a new object or a reference to the current object,
            depending on ``in_place``.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        new_codes = output._codes
        index, scalar = normalize_subscript(index, len(self), self._names)

        if self._levels == value._levels:
            # Identical levels, so codes can be transferred directly.
            for i, x in enumerate(index):
                new_codes[x] = value._codes[i]
        else:
            mapping = match(value._levels, self._levels)
            for i, x in enumerate(index):
                v = value._codes[i]
                if v >= 0:
                    new_codes[x] = mapping[v]
                else:
                    new_codes[x] = -1

        return output

    def __setitem__(self, index: "SubscriptTypes", value: Union[str, "Factor"]):
        """
        If ``index`` is a scalar, this is an alias for :py:meth:`~set_value`.

        If ``index`` is a sequence, this is an alias for :py:meth:`~set_slice`.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        if scalar:
            # Unwrap the length-1 normalized subscript, as in __getitem__.
            self.set_value(index[0], value, in_place=True)
        else:
            self.set_slice(NormalizedSubscript(index), value, in_place=True)

    #################################
    #####>>>> Level setting <<<<#####
    #################################

    def drop_unused_levels(self, in_place: bool = False) -> "Factor":
        """Drop unused levels.

        Args:
            in_place: Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where all unused levels have been removed.

            If ``in_place = True``, unused levels are removed from the
            current object; a reference to the current object is returned.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        in_use = [False] * len(self._levels)
        for x in self._codes:
            if x >= 0:
                in_use[x] = True

        new_levels = StringList([])
        reindex = [-1] * len(in_use)
        for i, x in enumerate(in_use):
            if x:
                reindex[i] = len(new_levels)
                new_levels.append(self._levels[i])

        new_codes = output._codes
        for i, x in enumerate(self._codes):
            if x >= 0:
                new_codes[i] = reindex[x]

        output._levels = new_levels
        return output

    def replace_levels(
        self,
        levels: Sequence[str],
        in_place: bool = False,
    ) -> "Factor":
        """Replace the existing levels with a new list. The codes of the
        returned ``Factor`` are unchanged by this method and will index into
        the replacement ``levels``, so each element of the ``Factor`` may
        refer to a different string after the levels are replaced. (To change
        the levels while ensuring that each element of the ``Factor`` refers
        to the same string, use :py:meth:`~remap_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values. The length of this sequence
                should be no less than the current number of levels.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. Codes
            are unchanged and may refer to different strings.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If ``levels`` is shorter than the existing levels,
                or contains missing or duplicated entries.
        """
        new_levels = levels
        if not isinstance(new_levels, StringList):
            new_levels = StringList(levels)

        if len(new_levels) < len(self._levels):
            raise ValueError(
                "'levels' should be at least as long as the existing levels"
            )

        present = set()
        for x in new_levels:
            if x is None:
                raise ValueError("all entries of 'levels' should be non-missing")
            if x in present:
                raise ValueError("all entries of 'levels' should be unique")
            present.add(x)

        if in_place:
            output = self
        else:
            output = copy(self)

        output._levels = new_levels
        return output

    def set_levels(
        self,
        levels: Union[str, Sequence[str]],
        remap: bool = True,
        in_place: bool = False,
    ) -> "Factor":
        """
        Alias for :py:meth:`~remap_levels` if ``remap = True``, otherwise an
        alias for :py:meth:`~replace_levels`. The first alias is deprecated
        and :py:meth:`~remap_levels` should be used directly if that is the
        intent.
        """
        if remap:
            warnings.warn(
                "'remap=True' is deprecated, use 'remap_levels()' instead",
                category=DeprecationWarning,
            )
            return self.remap_levels(levels, in_place=in_place)
        else:
            return self.replace_levels(levels, in_place=in_place)

    def remap_levels(
        self, levels: Union[str, Sequence[str]], in_place: bool = False
    ) -> "Factor":
        """Remap codes to a replacement list of levels. Each entry of the
        remapped ``Factor`` will refer to the same string across the old and
        new levels, provided that string is present in both sets of levels.
        (To change the levels without altering the codes of the ``Factor``,
        use :py:meth:`~replace_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values.

                Alternatively a single string containing an existing level in
                this object. The new levels are defined as a permutation of
                the existing levels where the provided string is now the
                first level. The order of all other levels is preserved.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. This will
            automatically update the codes so that they still refer to the
            same string in the new ``levels``. If a code refers to a level
            that is not present in the new ``levels``, it is set to a missing
            value.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If a string ``levels`` is not an existing level, or
                if a sequence ``levels`` has missing or duplicated entries.
        """
        lmapping = {}
        if isinstance(levels, str):
            new_levels = StringList([levels])
            for x in self._levels:
                if x == levels:
                    lmapping[x] = 0
                else:
                    lmapping[x] = len(new_levels)
                    new_levels.append(x)
            if levels not in lmapping:
                raise ValueError(
                    "string 'levels' should already be present among object levels"
                )
        else:
            new_levels = levels
            if not isinstance(new_levels, StringList):
                new_levels = StringList(levels)
            for i, x in enumerate(new_levels):
                if x is None:
                    raise ValueError("all entries of 'levels' should be non-missing")
                if x in lmapping:
                    raise ValueError("all entries of 'levels' should be unique")
                lmapping[x] = i

        # Position of each old level in the new levels, or -1 if dropped.
        mapping = [lmapping.get(x, -1) for x in self._levels]

        # The existing code array may be too narrow to index into a larger
        # set of levels, so write into a fresh array of sufficient width.
        old_codes = self._codes
        new_dtype = numpy.promote_types(
            old_codes.dtype, numpy.min_scalar_type(-max(len(new_levels), 1))
        )
        new_codes = numpy.empty(len(old_codes), dtype=new_dtype)
        for i, x in enumerate(old_codes):
            new_codes[i] = mapping[x] if x >= 0 else -1

        output = self if in_place else copy(self)
        output._codes = new_codes
        output._levels = new_levels
        return output

    ###########################
    #####>>>> Copying <<<<#####
    ###########################

    def __copy__(self) -> "Factor":
        """
        Returns:
            A shallow copy of the ``Factor`` object.
        """
        return type(self)(
            self._codes,
            levels=self._levels,
            ordered=self._ordered,
            names=self._names,
            _validate=False,
        )

    def __deepcopy__(self, memo) -> "Factor":
        """
        Returns:
            A deep copy of the ``Factor`` object.
        """
        return type(self)(
            deepcopy(self._codes, memo),
            levels=deepcopy(self._levels, memo),
            ordered=self._ordered,
            names=deepcopy(self._names, memo),
            _validate=False,
        )

    #############################
    #####>>>> Coercions <<<<#####
    #############################

    def to_pandas(self):
        """Coerce to :py:class:`~pandas.Categorical` object.

        Missing codes become missing entries of the ``Categorical``, and the
        categories are set to the factor levels in their existing order.

        Returns:
            Categorical: A :py:class:`~pandas.Categorical` object.
        """
        from pandas import Categorical

        levels = self._levels
        return Categorical(
            # Negative codes are missing; do not let them wrap around to the
            # last level.
            values=[None if c < 0 else levels[c] for c in self._codes],
            categories=list(levels),
            ordered=self._ordered,
        )

    @staticmethod
    def from_sequence(
        x: Sequence[str],
        levels: Optional[Sequence[str]] = None,
        sort_levels: bool = True,
        ordered: bool = False,
        names: Optional[Sequence[str]] = None,
        **kwargs
    ) -> "Factor":
        """Convert a sequence of hashable values into a factor.

        Args:
            x:
                A sequence of strings. Any value may be None to indicate
                missingness.

            levels:
                Sequence of reference levels, against which the entries in
                ``x`` are compared. If None, this defaults to all unique
                values of ``x``.

            sort_levels:
                Whether to sort the automatically-determined levels. If
                False, the levels are kept in order of their appearance in
                ``x``. Not used if ``levels`` is explicitly supplied.

            ordered:
                Whether the levels should be assumed to be ordered. Note that
                this refers to their importance and has nothing to do with
                their sorting order or with the setting of ``sort_levels``.

            names:
                List of names. This should have same length as ``x``.
                Alternatively None, if the factor has no names.

            kwargs:
                Further arguments to pass to
                :py:func:`~biocutils.factorize.factorize`.

        Returns:
            A ``Factor`` object.
        """
        levels, indices = factorize(x, levels=levels, sort_levels=sort_levels, **kwargs)
        return Factor(indices, levels=levels, ordered=ordered, names=names)
@subset_sequence.register
def _subset_sequence_Factor(x: Factor, indices: Sequence[int]) -> Factor:
    """``subset_sequence`` implementation for ``Factor``, via ``get_slice``."""
    return x.get_slice(NormalizedSubscript(indices))


@assign_sequence.register
def _assign_sequence_Factor(x: Factor, indices: Sequence[int], other: Factor) -> Factor:
    """``assign_sequence`` implementation for ``Factor``, via ``set_slice``."""
    return x.set_slice(NormalizedSubscript(indices), other)


@combine_sequences.register(Factor)
def _combine_factors(*x: Factor):
    """``combine_sequences`` implementation for ``Factor`` objects.

    If all inputs share the same levels and ordered status, their codes are
    concatenated directly. Otherwise, the levels are merged in order of first
    appearance, the codes are remapped to the merged levels, and the result
    is unordered.

    Args:
        x: One or more ``Factor`` objects.

    Returns:
        A combined object of the same type as the first input.

    Raises:
        ValueError: If any element of ``x`` is not a ``Factor``.
    """
    if not is_list_of_type(x, Factor):
        raise ValueError("all elements to `combine` must be `Factor` objects")

    first = x[0]
    first_levels = first._levels
    all_same = not any(
        f._levels != first_levels or f._ordered != first._ordered for f in x[1:]
    )

    if all_same:
        new_codes = [f._codes for f in x]
        new_levels = first._levels
        new_ordered = first._ordered
    else:
        new_codes = []
        all_levels_map = {}
        new_levels = StringList()
        for f in x:
            # Position of each of this factor's levels in the merged levels.
            mapping = []
            for y in f._levels:
                if y not in all_levels_map:
                    all_levels_map[y] = len(new_levels)
                    new_levels.append(y)
                mapping.append(all_levels_map[y])

            # Clamp to at least 1 so that the dtype is signed (and can hold
            # missing codes) even if no levels have been seen yet.
            curout = numpy.empty(
                len(f), dtype=numpy.min_scalar_type(-max(len(new_levels), 1))
            )
            for i, j in enumerate(f._codes):
                if j < 0:
                    curout[i] = j
                else:
                    curout[i] = mapping[j]
            new_codes.append(curout)
        new_ordered = False

    return type(x[0])(
        codes=combine_sequences(*new_codes),
        levels=new_levels,
        ordered=new_ordered,
        names=_combine_names(*x, get_names=lambda x: x.get_names()),
        _validate=False,
    )