from functools import singledispatch
from typing import Any, Sequence, Union
import numpy
from .Factor import Factor
from .subset import subset
[docs]
@singledispatch
def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray:
    """
    Find duplicated elements of ``x``.

    Args:
        x:
            Object to be searched for duplicates.
            This is usually a sequence that can be iterated over.
            Its elements must be hashable, as previously seen values are
            tracked in a set. If ``from_last = True``, ``x`` must also
            support integer indexing.

        incomparables:
            Values of ``x`` that cannot be compared.
            Any value of ``x`` in ``incomparables`` will never be a duplicate.
            Any object that supports the ``in`` operator (e.g., by defining
            ``__contains__``) can be used here.

        from_last:
            Whether to report the last occurrence as a non-duplicate.

    Returns:
        Boolean NumPy array of length equal to that of ``x``.
        An entry is truthy if the corresponding element of ``x`` repeats a
        value that was already encountered, so the first occurrence of each
        value is falsy and all later occurrences are truthy.
        If ``from_last = True``, ``x`` is scanned from the end instead, so
        only the last occurrence of each value is falsy.
        Entries for values in ``incomparables`` are always falsy.

    Examples:
        >>> import biocutils
        >>> biocutils.duplicated([1, 2, 1, 2, 3, 2])
        array([False, False,  True,  True, False,  True])
        >>> biocutils.duplicated([1, 2, 1, 2, 3, 2], from_last=True)
        array([ True,  True, False,  True, False, False])
        >>> biocutils.duplicated([1, 2, None, None, 3, 2])
        array([False, False, False,  True, False,  True])
        >>> biocutils.duplicated([1, 2, None, None, 3, 2], incomparables={None})
        array([False, False, False, False, False,  True])
    """
    seen = set()

    # Start from an all-False array so that only duplicates need flagging;
    # this also avoids ever exposing uninitialised memory.
    output = numpy.zeros(len(x), dtype=numpy.bool_)

    def process(i, y):
        # Incomparable values are never duplicates and are not tracked.
        if y in incomparables:
            return
        if y in seen:
            output[i] = True
        else:
            seen.add(y)

    if from_last:
        # Walk backwards so that the last occurrence is the one "seen" first.
        for i in range(len(x) - 1, -1, -1):
            process(i, x[i])
    else:
        for i, y in enumerate(x):
            process(i, y)

    return output
@duplicated.register
def _duplicated_Factor(
    x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False
) -> numpy.ndarray:
    """
    Find duplicated elements of a ``Factor`` by inspecting its codes.

    Args:
        x:
            Factor to be searched for duplicates.

        incomparables:
            Levels that cannot be compared.
            Any element of ``x`` whose level is in ``incomparables`` will
            never be a duplicate. ``None`` may be included to apply the same
            treatment to codes of -1.

        from_last:
            Whether to report the last occurrence as a non-duplicate.

    Returns:
        Boolean NumPy array of length equal to that of ``x``.
        An entry is truthy if the code at that position was already
        encountered earlier in the scan (or later, if ``from_last = True``)
        and its level is not in ``incomparables``.
    """
    # One tri-state slot per level:
    #   None  -> level is incomparable, never reported as a duplicate;
    #   False -> level not encountered yet;
    #   True  -> level already encountered.
    state = [None if lev in incomparables else False for lev in x.get_levels()]

    # Handling codes of -1, i.e., None: indexing the list with -1 lands on
    # this extra trailing slot.
    state.append(None if None in incomparables else False)

    codes = x.get_codes()

    # Start from an all-False array so that only duplicates need flagging.
    output = numpy.zeros(len(x), dtype=numpy.bool_)

    num = len(x)
    order = range(num - 1, -1, -1) if from_last else range(num)
    for i in order:
        code = codes[i]
        status = state[code]
        if status is None:
            continue
        if status:
            output[i] = True
        else:
            state[code] = True

    return output
[docs]
def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> Any:
    """
    Get all unique values of ``x``.

    Args:
        x:
            Object in which to find unique entries.
            This is usually a sequence that can be iterated over.

        incomparables:
            Values of ``x`` that cannot be compared.
            Any value of ``x`` in ``incomparables`` will never be treated as
            a duplicate, so every such entry is retained.
            Any object that supports the ``in`` operator (e.g., by defining
            ``__contains__``) can be used here.

        from_last:
            Whether to retain the last occurrence of each value in ``x``.
            By default, the first occurrence is retained.

    Returns:
        An object containing unique values of ``x``, obtained by calling
        :py:func:`subset` on ``x`` with the indices of its non-duplicated
        entries. This is usually of the same class as ``x``.

    Examples:
        >>> import biocutils
        >>> biocutils.unique([1, 2, 1, 2, 3, 2])
        >>> biocutils.unique([1, 2, None, None, 3, 2])
        >>> biocutils.unique([1, 2, None, None, 3, 2], incomparables={None})
    """
    is_dup = duplicated(x, incomparables=incomparables, from_last=from_last)

    # Positions of the entries that were not flagged as duplicates.
    keep = numpy.nonzero(numpy.logical_not(is_dup))[0]

    return subset(x, keep)