from copy import copy, deepcopy
from typing import Optional, Sequence, Union
import numpy
import warnings
from .assign_sequence import assign_sequence
from .combine_sequences import combine_sequences
from .factorize import factorize
from .is_list_of_type import is_list_of_type
from .is_missing_scalar import is_missing_scalar
from .match import match
from .Names import Names, _combine_names, _name_to_position, _sanitize_names
from .normalize_subscript import (
NormalizedSubscript,
SubscriptTypes,
normalize_subscript,
)
from .print_truncated import print_truncated_list
from .StringList import StringList
from .subset_sequence import subset_sequence
def _sanitize_codes(codes: Sequence[int], num_levels: int) -> numpy.ndarray:
if not isinstance(codes, numpy.ndarray):
replacement = numpy.ndarray(
len(codes), dtype=numpy.min_scalar_type(-num_levels)
) # get a signed type.
for i, x in enumerate(codes):
if is_missing_scalar(x) or x < 0:
replacement[i] = -1
else:
replacement[i] = x
codes = replacement
else:
if len(codes.shape) != 1:
raise ValueError("'codes' should be a 1-dimensional array")
if not numpy.issubdtype(
codes.dtype, numpy.signedinteger
): # force it to be signed.
codes = codes.astype(numpy.min_scalar_type(-num_levels))
for x in codes:
if x < -1 or x >= num_levels:
raise ValueError(
"all entries of 'codes' should refer to an entry of 'levels'"
)
return codes
def _sanitize_levels(levels: Sequence[str], check: bool = True) -> StringList:
    """Coerce ``levels`` to an unnamed ``StringList`` and optionally validate it.

    Args:
        levels:
            Sequence of candidate levels.

        check:
            Whether to verify that the levels are non-missing and unique.

    Returns:
        A ``StringList`` of levels without names.

    Raises:
        TypeError: If ``check`` is true and any level is None.
        ValueError: If ``check`` is true and the levels contain duplicates.
    """
    sanitized = levels if isinstance(levels, StringList) else StringList(levels)

    # Levels are purely positional, so any names they carry are dropped.
    if sanitized.get_names() is not None:
        sanitized = sanitized.set_names(None)

    if not check:
        return sanitized

    for entry in sanitized:
        if entry is None:
            raise TypeError("all entries of 'levels' should be non-missing")

    num_unique = len(set(sanitized))
    if num_unique < len(sanitized):
        raise ValueError("all entries of 'levels' should be unique")

    return sanitized
class FactorIterator:
    """Iterator for a :py:class:`~Factor` object."""

    def __init__(self, parent: "Factor"):
        """
        Args:
            parent: The parent :py:class:`~Factor` object.
        """
        self._parent = parent
        self._position = 0

    def __iter__(self) -> "FactorIterator":
        """
        Returns:
            The iterator itself.
        """
        return self

    def __next__(self) -> Union[str, None]:
        """
        Returns:
            Whatever the parent's ``get_value`` reports for the current
            position, i.e., the level for the code at that position or None
            for a missing code.

        Raises:
            StopIteration: Once the position reaches the length of the parent.
        """
        current = self._position
        if current >= len(self._parent):
            raise StopIteration

        # Only advance once the value has been successfully retrieved.
        value = self._parent.get_value(current)
        self._position = current + 1
        return value
class Factor:
    """Factor class, equivalent to R's ``factor``.

    This is a vector of integer codes, each of which is an index into a list of
    unique strings. The aim is to encode a list of strings as integers for
    easier numerical analysis.
    """

    def __init__(
        self,
        codes: Sequence[int],
        levels: Sequence[str],
        ordered: bool = False,
        names: Optional["Names"] = None,
        _validate: bool = True,
    ):
        """Initialize a Factor object.

        Args:
            codes:
                Sequence of codes. Each valid code should be a non-negative
                integer that refers to an entry ``levels``. Codes may be
                negative or correspond to a missing scalar (as defined by
                :py:meth:`~biocutils.is_missing_scalar.is_missing_scalar`),
                in which case they are assumed to represent missing values.

            levels:
                List of levels containing unique strings.

            ordered:
                Whether the levels are ordered.

            names:
                List of names. This should have same length as ``codes``.
                Alternatively None, if the factor has no names yet.

            _validate:
                Internal use only. If False, ``codes``, ``levels`` and
                ``names`` are stored without any sanitization.
        """
        if _validate:
            levels = _sanitize_levels(levels)
            codes = _sanitize_codes(codes, len(levels))
            names = _sanitize_names(names, len(codes))

        self._codes = codes
        self._levels = levels
        self._ordered = bool(ordered)
        self._names = names

    ##################################
    #####>>>> Simple getters <<<<#####
    ##################################

    def _define_output(self, in_place: bool) -> "Factor":
        """Return ``self`` for in-place edits, otherwise a shallow copy."""
        if in_place:
            return self
        else:
            return copy(self)

    def get_codes(self) -> numpy.ndarray:
        """
        Returns:
            Array of integer codes, used as indices into the levels from
            :py:meth:`~get_levels`. Missing values are marked with -1.

            This should be treated as a read-only reference. To modify
            the codes, use :py:meth:`~set_codes` instead.
        """
        return self._codes

    @property
    def codes(self) -> numpy.ndarray:
        """Alias for :py:meth:`~get_codes`."""
        return self.get_codes()

    def set_codes(self, codes: Sequence[int], in_place: bool = False) -> "Factor":
        """
        Args:
            codes:
                Integer codes referencing the factor levels. This should
                have the same length as the current object.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A modified ``Factor`` object with the new codes, either as a
            new object or as a reference to the current object.

        Raises:
            ValueError: If ``codes`` has a different length to this object.
        """
        output = self._define_output(in_place)
        if len(codes) != len(self):
            raise ValueError(
                "length of 'codes' should be equal to that of the current object"
            )
        output._codes = _sanitize_codes(codes, len(self._levels))
        return output

    def get_levels(self) -> "StringList":
        """
        Returns:
            List of strings containing the factor levels.

            This should be treated as a read-only reference. To modify the
            levels, use :py:meth:`~replace_levels` instead.
        """
        return self._levels

    @property
    def levels(self) -> "StringList":
        """Alias for :py:meth:`~get_levels`."""
        return self.get_levels()

    def get_ordered(self) -> bool:
        """
        Returns:
            True if the levels are ordered, otherwise False.
        """
        return self._ordered

    @property
    def ordered(self) -> bool:
        """Alias for :py:meth:`~get_ordered`."""
        return self.get_ordered()

    def set_ordered(self, ordered: bool, in_place: bool = False) -> "Factor":
        """
        Args:
            ordered:
                Whether to treat the levels as being ordered.

            in_place:
                Whether to modify this object in-place.

        Returns:
            A modified ``Factor`` object with the new ordered status, either as
            a new object or as a reference to the current object.
        """
        output = self._define_output(in_place)
        output._ordered = bool(ordered)
        return output

    def get_names(self) -> Optional["Names"]:
        """
        Returns:
            Names for the factor elements, or None if there are no names.

            This should be treated as a read-only reference. To modify the
            names, use :py:meth:`~set_names` instead.
        """
        return self._names

    @property
    def names(self) -> Optional["Names"]:
        """Alias for :py:meth:`~get_names`."""
        return self.get_names()

    def set_names(self, names: Optional["Names"], in_place: bool = False) -> "Factor":
        """
        Args:
            names:
                List of names, of the same length as this object.
                Alternatively None, to remove the names.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            A modified ``Factor`` with the new names, either as a new object or
            as a reference to the current object.
        """
        output = self._define_output(in_place)
        output._names = _sanitize_names(names, len(self))
        return output

    #################################
    #####>>>> Miscellaneous <<<<#####
    #################################

    def __len__(self) -> int:
        """
        Returns:
            Length of the factor in terms of the number of codes.
        """
        return len(self._codes)

    def __iter__(self) -> "FactorIterator":
        """
        Returns:
            An iterator over the factor. This will iterate over the codes and
            report the corresponding level (or None).
        """
        return FactorIterator(self)

    def __repr__(self) -> str:
        """
        Returns:
            A stringified representation of this object.
        """
        tmp = (
            "Factor(codes="
            + print_truncated_list(self._codes)
            + ", levels="
            + print_truncated_list(self._levels)
        )
        if self._ordered:
            tmp += ", ordered=True"
        # Test against None (as in __str__) rather than relying on truthiness.
        if self._names is not None:
            tmp += ", names=" + print_truncated_list(self._names)
        tmp += ")"
        return tmp

    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed representation of this object.
        """
        num_levels = len(self._levels)
        message = (
            "Factor of length "
            + str(len(self._codes))
            + " with "
            + str(num_levels)
            + " level"
        )
        if num_levels != 1:
            message += "s"
        message += "\n"

        def show_value(i) -> str:
            # Negative codes are missing; indexing the levels with them would
            # silently report the last level instead.
            if i < 0:
                return "None"
            return self._levels[i]

        message += (
            "values: "
            + print_truncated_list(
                self._codes, transform=show_value, include_brackets=False
            )
            + "\n"
        )

        if self._names is not None:
            message += (
                "names: "
                + print_truncated_list(
                    self._names, transform=lambda x: x, include_brackets=False
                )
                + "\n"
            )

        message += (
            "levels: "
            + print_truncated_list(
                self._levels, transform=lambda x: x, include_brackets=False
            )
            + "\n"
        )

        message += "ordered: " + str(self._ordered)
        return message

    def __eq__(self, other: "Factor") -> bool:
        """
        Args:
            other: Another ``Factor``.

        Returns:
            Whether the current object is equal to ``other``, i.e.,
            same codes, levels, names and ordered status. This is always
            False if ``other`` is not a ``Factor``.
        """
        if not isinstance(other, Factor):
            return False
        if (
            len(self) != len(other)
            or self._levels != other._levels
            or self._names != other._names
            or self._ordered != other._ordered
        ):
            return False
        # Coerce the NumPy boolean into a plain Python bool.
        return bool((self._codes == other._codes).all())

    ###########################
    #####>>>> Slicing <<<<#####
    ###########################

    def get_value(self, index: Union[str, int]) -> Union[str, None]:
        """
        Args:
            index:
                Integer index of the element to obtain. Alternatively, a string
                containing the name of the element, using the first occurrence
                if duplicate names are present.

        Returns:
            The factor level for the code at the specified position, or None if
            the entry is missing.
        """
        if isinstance(index, str):
            index = _name_to_position(self._names, index)
        i = self._codes[index]
        if i < 0:
            return None
        return self._levels[i]

    def get_slice(self, index: "SubscriptTypes") -> "Factor":
        """
        Args:
            index:
                Subset of elements to obtain, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present. Scalars are treated as length-1 sequences.

        Returns:
            A ``Factor`` is returned containing the specified subset.
        """
        index, _ = normalize_subscript(index, len(self), self._names)
        output = copy(self)
        output._codes = self._codes[index]
        if output._names is not None:
            output._names = subset_sequence(self._names, index)
        return output

    def __getitem__(self, index: "SubscriptTypes") -> Union[str, "Factor"]:
        """
        If ``index`` is a scalar, this is an alias for :py:meth:`~get_value`.

        If ``index`` is a sequence, this is an alias for :py:meth:`~get_slice`.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        if scalar:
            return self.get_value(index[0])
        else:
            return self.get_slice(NormalizedSubscript(index))

    def set_value(
        self, index: Union[str, int], value: Union[str, None], in_place: bool = False
    ) -> "Factor":
        """
        Args:
            index:
                Integer index of the element to replace. Alternatively, a string
                containing the name of the element, using the first occurrence
                if duplicate names are present.

            value:
                Replacement value. This should be a string corresponding to a
                factor level, or None if missing.

            in_place:
                Whether to perform the modification in place.

        Returns:
            A ``Factor`` object with the modified entry at ``index``. This is either
            a new object or a reference to the current object.

        Raises:
            IndexError: If ``value`` is not None and is not among the levels.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            # The shallow copy shares the code array, so duplicate it before editing.
            output._codes = copy(self._codes)

        if isinstance(index, str):
            index = _name_to_position(self._names, index)

        if value is None:
            output._codes[index] = -1
            return output

        for i, level in enumerate(output._levels):
            if level == value:
                output._codes[index] = i
                return output

        raise IndexError("failed to find level '" + str(value) + "'")

    def set_slice(
        self, index: "SubscriptTypes", value: "Factor", in_place: bool = False
    ) -> "Factor":
        """
        Replace items in the ``Factor`` list. The ``index`` elements in the
        current object are replaced with the corresponding values in ``value``.
        This is performed by finding the level for each entry of the
        replacement ``value``, matching it to a level in the current object,
        and replacing the entry of ``codes`` with the code of the matched
        level. If there is no matching level, a missing value is inserted.

        Args:
            index:
                Subset of elements to replace, see
                :py:func:`~biocutils.normalize_subscript.normalize_subscript`
                for details. Strings are matched to names in the current
                object, using the first occurrence if duplicate names are
                present. Scalars are treated as length-1 sequences.

            value:
                A ``Factor`` of the same length containing the replacement values.

            in_place:
                Whether the replacement should be performed in place.

        Returns:
            A ``Factor`` object with values at ``index`` replaced by ``value``.
            This is either a new object or a reference to the current object,
            depending on ``in_place``.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            # The shallow copy shares the code array, so duplicate it before editing.
            output._codes = copy(self._codes)

        new_codes = output._codes
        index, _ = normalize_subscript(index, len(self), self._names)

        if self._levels == value._levels:
            # Identical levels: the replacement codes can be used directly.
            for i, x in enumerate(index):
                new_codes[x] = value._codes[i]
        else:
            # Otherwise translate each replacement code via its level.
            mapping = match(value._levels, self._levels)
            for i, x in enumerate(index):
                v = value._codes[i]
                if v >= 0:
                    new_codes[x] = mapping[v]
                else:
                    new_codes[x] = -1

        return output

    def __setitem__(self, index: "SubscriptTypes", value: Union[str, "Factor"]):
        """
        If ``index`` is a scalar, this is an alias for :py:meth:`~set_value`.

        If ``index`` is a sequence, this is an alias for :py:meth:`~set_slice`.

        In both cases, the modification is performed in place.
        """
        index, scalar = normalize_subscript(index, len(self), self._names)
        if scalar:
            # Unwrap the normalized length-1 subscript, as in __getitem__.
            self.set_value(index[0], value, in_place=True)
        else:
            self.set_slice(NormalizedSubscript(index), value, in_place=True)

    #################################
    #####>>>> Level setting <<<<#####
    #################################

    def drop_unused_levels(self, in_place: bool = False) -> "Factor":
        """Drop unused levels.

        Args:
            in_place: Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new ``Factor`` object)
            where all unused levels have been removed.

            If ``in_place = True``, unused levels are removed from the
            current object; a reference to the current object is returned.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        in_use = [False] * len(self._levels)
        for x in self._codes:
            if x >= 0:
                in_use[x] = True

        # Retained levels keep their relative order; 'reindex' maps each old
        # code to its position among the retained levels.
        new_levels = StringList([])
        reindex = [-1] * len(in_use)
        for i, used in enumerate(in_use):
            if used:
                reindex[i] = len(new_levels)
                new_levels.append(self._levels[i])

        new_codes = output._codes
        for i, x in enumerate(self._codes):
            if x >= 0:
                new_codes[i] = reindex[x]

        output._levels = new_levels
        return output

    def replace_levels(
        self,
        levels: Sequence[str],
        in_place: bool = False,
    ) -> "Factor":
        """Replace the existing levels with a new list. The codes of the
        returned ``Factor`` are unchanged by this method and will index into
        the replacement ``levels``, so each element of the ``Factor`` may refer
        to a different string after the levels are replaced. (To change the
        levels while ensuring that each element of the ``Factor`` refers to the
        same string, use :py:meth:`~remap_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values. The length of this sequence
                should be no less than the current number of levels.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. Codes
            are unchanged and may refer to different strings.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If ``levels`` is shorter than the existing levels, or
                contains missing or duplicated entries.
        """
        new_levels = levels
        if not isinstance(new_levels, StringList):
            new_levels = StringList(levels)

        if len(new_levels) < len(self._levels):
            raise ValueError("'levels' should be at least as long as the existing levels")

        present = set()
        for x in new_levels:
            if x is None:
                raise ValueError("all entries of 'levels' should be non-missing")
            if x in present:
                raise ValueError("all entries of 'levels' should be unique")
            present.add(x)

        if in_place:
            output = self
        else:
            output = copy(self)

        output._levels = new_levels
        return output

    def set_levels(
        self,
        levels: Union[str, Sequence[str]],
        remap: bool = True,
        in_place: bool = False
    ) -> "Factor":
        """
        Alias for :py:meth:`~remap_levels` if ``remap = True``, otherwise an
        alias for :py:meth:`~replace_levels`. The first alias is deprecated and
        :py:meth:`~remap_levels` should be used directly if that is the intent.

        Args:
            levels: Replacement levels, passed to the chosen method.
            remap: Whether to remap the codes to the new levels.
            in_place: Whether to perform this modification in-place.

        Returns:
            The ``Factor`` returned by the chosen method.
        """
        if remap:
            # 'stacklevel=2' attributes the warning to the caller's line.
            warnings.warn(
                "'remap=True' is deprecated, use 'remap_levels()' instead",
                category=DeprecationWarning,
                stacklevel=2,
            )
            return self.remap_levels(levels, in_place=in_place)
        else:
            return self.replace_levels(levels, in_place=in_place)

    def remap_levels(
        self, levels: Union[str, Sequence[str]], in_place: bool = False
    ) -> "Factor":
        """Remap codes to a replacement list of levels. Each entry of the
        remapped ``Factor`` will refer to the same string across the old and
        new levels, provided that string is present in both sets of levels.
        (To change the levels without altering the codes of the ``Factor``, use
        :py:meth:`~replace_levels` instead.)

        Args:
            levels:
                A sequence of replacement levels. These should be unique
                strings with no missing values.

                Alternatively a single string containing an existing level in
                this object. The new levels are defined as a permutation of the
                existing levels where the provided string is now the first
                level. The order of all other levels is preserved.

            in_place:
                Whether to perform this modification in-place.

        Returns:
            If ``in_place = False``, returns same type as caller (a new
            ``Factor`` object) where the levels have been replaced. This will
            automatically update the codes so that they still refer to the same
            string in the new ``levels``. If a code refers to a level that is
            not present in the new ``levels``, it is set to a missing value.

            If ``in_place = True``, the levels are replaced in the current
            object, and a reference to the current object is returned.

        Raises:
            ValueError: If a string ``levels`` is not an existing level, or if
                a sequence ``levels`` contains missing or duplicated entries.
        """
        if in_place:
            output = self
        else:
            output = copy(self)
            output._codes = copy(self._codes)

        # Maps each level string to its position in the new levels.
        lmapping = {}
        if isinstance(levels, str):
            new_levels = StringList([levels])
            for x in self._levels:
                if x == levels:
                    lmapping[x] = 0
                else:
                    lmapping[x] = len(new_levels)
                    new_levels.append(x)
            if levels not in lmapping:
                raise ValueError(
                    "string 'levels' should already be present among object levels"
                )
        else:
            new_levels = levels
            if not isinstance(new_levels, StringList):
                new_levels = StringList(levels)
            for i, x in enumerate(new_levels):
                if x is None:
                    raise ValueError("all entries of 'levels' should be non-missing")
                if x in lmapping:
                    raise ValueError("all entries of 'levels' should be unique")
                lmapping[x] = i

        # Maps each old code to its new code, or -1 if its level was dropped.
        mapping = [-1] * len(self._levels)
        for i, x in enumerate(self._levels):
            if x in lmapping:
                mapping[i] = lmapping[x]

        new_codes = output._codes
        for i, x in enumerate(new_codes):
            if x >= 0:
                new_codes[i] = mapping[x]
            else:
                new_codes[i] = -1

        output._levels = new_levels
        return output

    ###########################
    #####>>>> Copying <<<<#####
    ###########################

    def __copy__(self) -> "Factor":
        """
        Returns:
            A shallow copy of the ``Factor`` object.
        """
        return type(self)(
            self._codes,
            levels=self._levels,
            ordered=self._ordered,
            names=self._names,
            _validate=False,
        )

    def __deepcopy__(self, memo) -> "Factor":
        """
        Returns:
            A deep copy of the ``Factor`` object.
        """
        return type(self)(
            deepcopy(self._codes, memo),
            levels=deepcopy(self._levels, memo),
            ordered=self._ordered,
            names=deepcopy(self._names, memo),
            _validate=False,
        )

    #############################
    #####>>>> Coercions <<<<#####
    #############################

    def to_pandas(self):
        """Coerce to :py:class:`~pandas.Categorical` object.

        The categories are set to the factor levels in their existing order
        (including unused levels), and missing codes become missing values.

        Returns:
            Categorical: A :py:class:`~pandas.Categorical` object.
        """
        from pandas import Categorical

        # Build from the codes directly: indexing the levels with a missing
        # code (-1) would otherwise pick up the last level.
        return Categorical.from_codes(
            codes=self._codes,
            categories=list(self._levels),
            ordered=self._ordered,
        )

    @staticmethod
    def from_sequence(
        x: Sequence[str],
        levels: Optional[Sequence[str]] = None,
        sort_levels: bool = True,
        ordered: bool = False,
        names: Optional[Sequence[str]] = None,
        **kwargs
    ) -> "Factor":
        """Convert a sequence of hashable values into a factor.

        Args:
            x:
                A sequence of strings. Any value may be None to indicate
                missingness.

            levels:
                Sequence of reference levels, against which the entries in ``x`` are compared.
                If None, this defaults to all unique values of ``x``.

            sort_levels:
                Whether to sort the automatically-determined levels. If False,
                the levels are kept in order of their appearance in ``x``. Not
                used if ``levels`` is explicitly supplied.

            ordered:
                Whether the levels should be assumed to be ordered. Note that
                this refers to their importance and has nothing to do with
                their sorting order or with the setting of ``sort_levels``.

            names:
                List of names. This should have same length as ``x``.
                Alternatively None, if the factor has no names.

            kwargs:
                Further arguments to pass to
                :py:func:`~biocutils.factorize.factorize`.

        Returns:
            A ``Factor`` object.
        """
        levels, indices = factorize(x, levels=levels, sort_levels=sort_levels, **kwargs)
        return Factor(indices, levels=levels, ordered=ordered, names=names)
@subset_sequence.register
def _subset_sequence_Factor(x: Factor, indices: Sequence[int]) -> Factor:
    """Implement ``subset_sequence`` for ``Factor`` via ``Factor.get_slice``."""
    # Wrap the indices in NormalizedSubscript before handing them to get_slice.
    subscript = NormalizedSubscript(indices)
    return x.get_slice(subscript)
@assign_sequence.register
def _assign_sequence_Factor(x: Factor, indices: Sequence[int], other: Factor) -> Factor:
    """Implement ``assign_sequence`` for ``Factor`` via ``Factor.set_slice``."""
    # 'in_place' is left at its default, so 'x' itself is not modified.
    subscript = NormalizedSubscript(indices)
    return x.set_slice(subscript, other)
@combine_sequences.register(Factor)
def _combine_factors(*x: Factor):
    """Combine multiple ``Factor`` objects into a single ``Factor``.

    If all factors share the same levels and ordered status, their codes are
    combined directly and the levels and ordered status are kept. Otherwise,
    the new levels are the union of all levels in order of first appearance,
    the codes are remapped accordingly, and the result is unordered.

    Args:
        x: One or more ``Factor`` objects.

    Returns:
        An instance of the type of the first factor, containing the combined
        codes, levels and names.

    Raises:
        ValueError: If any element of ``x`` is not a ``Factor``.
    """
    if not is_list_of_type(x, Factor):
        raise ValueError("all elements to `combine` must be `Factor` objects")

    first = x[0]
    first_levels = first._levels
    all_same = not any(
        f._levels != first_levels or f._ordered != first._ordered for f in x[1:]
    )

    if all_same:
        new_codes = [f._codes for f in x]
        new_levels = first._levels
        new_ordered = first._ordered
    else:
        new_codes = []
        all_levels_map = {}
        new_levels = StringList()
        for f in x:
            # Position of each of this factor's levels in the combined levels.
            mapping = []
            for y in f._levels:
                if y not in all_levels_map:
                    all_levels_map[y] = len(new_levels)
                    new_levels.append(y)
                mapping.append(all_levels_map[y])

            # 'min_scalar_type(0)' is unsigned, so request room for at least -1
            # to be able to store missing codes when no levels exist yet.
            curout = numpy.ndarray(
                len(f), dtype=numpy.min_scalar_type(-max(len(new_levels), 1))
            )
            for i, j in enumerate(f._codes):
                if j < 0:
                    curout[i] = -1
                else:
                    curout[i] = mapping[j]
            new_codes.append(curout)
        new_ordered = False

    return type(x[0])(
        codes=combine_sequences(*new_codes),
        levels=new_levels,
        ordered=new_ordered,
        names=_combine_names(*x, get_names=lambda x: x.get_names()),
        _validate=False,
    )