Source code for biocutils.normalize_subscript

from typing import Any, Optional, Sequence, Tuple, Union

import numpy


def _raise_int(idx: int, length):
    raise IndexError(
        "subscript ("
        + str(idx)
        + ") out of range for vector-like object of length "
        + str(length)
    )


def _is_scalar_bool(sub):
    return isinstance(sub, bool) or isinstance(sub, numpy.bool_)



[docs]
class NormalizedSubscript:
    """
    Subscript normalized by :py:func:`~normalize_subscript`. This
    is used to indicate that no further normalization is required,
    such that :py:func:`~normalize_subscript` is just a no-op.

    The wrapped sequence is stored as-is (no copy, no validation) and
    indexing and ``len()`` are forwarded to it.
    """

    def __init__(self, subscript: Sequence[int]):
        """
        Args:
            subscript:
                Sequence of integers for a normalized subscript.
        """
        self._subscript = subscript

    @property
    def subscript(self) -> Sequence[int]:
        """
        Returns:
            The subscript, as a sequence of integer positions. This is
            the same object that was passed to the constructor.
        """
        return self._subscript

    def __getitem__(self, index: Any) -> Any:
        """
        Args:
            index:
                Any argument accepted by the ``__getitem__`` method of the
                :py:attr:`~subscript`.

        Returns:
            The same return value as the ``__getitem__`` method of the
            subscript. This should be an integer if ``index`` is an integer.
        """
        return self._subscript[index]

    def __len__(self) -> int:
        """
        Returns:
            Length of the subscript.
        """
        return len(self._subscript)




# Type alias for the ``sub`` argument of normalize_subscript(): a slice, a
# range, a sequence of elements, a scalar int/str/bool, or a subscript that
# has already been wrapped in NormalizedSubscript.
SubscriptTypes = Union[slice, range, Sequence, int, str, bool, NormalizedSubscript]



[docs]
def normalize_subscript(
    sub: SubscriptTypes,
    length: int,
    names: Optional[Sequence[str]] = None,
    non_negative_only: bool = True,
) -> Tuple:
    """
    Normalize a subscript for ``__getitem__`` or friends into a sequence of
    integer indices, for consistent downstream use.

    Args:
        sub:
            The subscript. This can be any of the following:

            - A slice.
            - A range containing indices to elements. Negative values are
              allowed. An error is raised if the indices are out of range.
            - A single integer specifying the index of an element. A negative
              value is allowed. An error is raised if the index is out of range.
            - A single string that can be found in ``names``, which is
              converted to the index of the first occurrence of that string in
              ``names``. An error is raised if the string cannot be found.
            - A single boolean, which is converted into a list containing the
              first element if true, and an empty list if false.
            - A sequence of strings, integers and/or booleans. Strings are
              converted to indices based on first occurrence in ``names``,
              as described above. Integers should be indices to an element.
              Each truthy boolean is converted to an index equal to its
              position in ``sub``, and each falsey boolean is ignored.
            - A :py:class:`~NormalizedSubscript`, in which case the
              ``subscript`` property is directly returned.

        length:
            Length of the object.

        names:
            List of names for each entry in the object. If not None, this
            should have length equal to ``length``. Some optimizations
            are possible if this is a :py:class:`~Names.Names` object.

        non_negative_only:
            Whether negative indices must be converted into non-negative
            equivalents. Setting this to ``False`` may improve efficiency.

    Returns:
        A tuple containing (i) a sequence of integer indices specifying the
        subscript elements, and (ii) a boolean indicating whether ``sub``
        was a scalar. Integer indices are validated against
        ``[-length, length)`` and, when ``non_negative_only`` is true,
        converted to ``[0, length)``; otherwise negative indices may be
        returned unchanged. Where possible the input (range or sequence) is
        returned as-is rather than copied. A false scalar boolean yields an
        empty list and is reported as non-scalar, and a
        :py:class:`~NormalizedSubscript` is always reported as non-scalar.

    Raises:
        IndexError:
            If an integer index is out of range, if a string subscript is
            supplied without ``names``, if a scalar string is absent from
            ``names``, or if a string within a sequence is absent from a
            :py:class:`~Names.Names` object.

        KeyError:
            If a string within a sequence is absent from ``names`` and
            ``names`` is not a :py:class:`~Names.Names` object.
    """
    if isinstance(sub, NormalizedSubscript):
        return sub.subscript, False

    if _is_scalar_bool(sub):  # before ints, as bools are ints.
        if sub:
            return [0], True
        else:
            return [], False

    if isinstance(sub, (int, numpy.integer)):
        if sub < -length or sub >= length:
            _raise_int(sub, length)
        if sub < 0 and non_negative_only:
            sub += length
        return [int(sub)], True

    if isinstance(sub, str):
        if names is None:
            raise IndexError(
                "failed to find subscript '"
                + sub
                + "' for vector-like object with no names"
            )
        found = -1
        from .Names import Names

        if isinstance(names, Names):
            found = names.map(sub)
        else:
            # Plain sequence of names: linear scan for the first occurrence.
            for j, n in enumerate(names):
                if n == sub:
                    found = j
                    break
        if found < 0:
            raise IndexError("cannot find subscript '" + sub + "' in the names")
        return [found], True

    if isinstance(sub, slice):
        # slice.indices() clamps to the object, so no range check is needed.
        return range(*sub.indices(length)), False

    if isinstance(sub, range):
        if len(sub) == 0:
            return [], False

        # A range is monotonic, so validating both endpoints validates every
        # element. The order of checks determines which endpoint is reported.
        first = sub[0]
        last = sub[-1]
        if first >= length:
            _raise_int(first, length)
        if last >= length:
            _raise_int(last, length)
        if first < -length:
            _raise_int(first, length)
        if last < -length:
            _raise_int(last, length)

        if not non_negative_only:
            return sub, False
        if sub.start < 0 and sub.stop < 0:
            # Shifting both bounds by 'length' preserves the range as a range.
            return range(length + sub.start, length + sub.stop, sub.step), False
        if sub.start < 0 or sub.stop < 0:
            # The bounds straddle zero, so elements are shifted one by one.
            return [(x < 0) * length + x for x in sub], False
        return sub, False

    # Fast path: a sequence consisting only of integers that need no
    # conversion can be validated and returned without copying.
    can_return_early = True
    for x in sub:
        if isinstance(x, str) or _is_scalar_bool(x) or (x < 0 and non_negative_only):
            can_return_early = False
            break

    if can_return_early:
        for x in sub:
            if x >= length or x < -length:
                _raise_int(x, length)
        return sub, False

    output = []
    # (slot in 'output', name) for each string whose lookup is deferred. The
    # name is stored explicitly because positions in 'output' and 'sub' stop
    # lining up as soon as a falsey boolean is skipped.
    pending = []
    from .Names import Names

    are_names_indexed = isinstance(names, Names)

    for position, x in enumerate(sub):
        if isinstance(x, str):
            if are_names_indexed:
                found = names.map(x)
                if found < 0:
                    raise IndexError("cannot find subscript '" + x + "' in the names")
                output.append(found)
            else:
                pending.append((len(output), x))
                output.append(None)  # placeholder, filled in below.
        elif _is_scalar_bool(x):
            if x:
                output.append(position)
        elif x < 0:
            if x < -length:
                _raise_int(x, length)
            output.append(int(x) + length)
        else:
            if x >= length:
                _raise_int(x, length)
            output.append(int(x))

    if pending:
        if names is None:
            raise IndexError(
                "cannot find string subscripts for vector-like object with no names"
            )

        # Single pass over the names to resolve all requested strings at once.
        wanted = {name for _, name in pending}
        mapping = {}
        for j, y in enumerate(names):
            if y in wanted:
                mapping[y] = j
                wanted.remove(y)  # remove it so we only consider the first.
                if not wanted:
                    break

        for slot, name in pending:
            output[slot] = mapping[name]

    return output, False