Source code for compressed_lists.base

from typing import Any, Callable, Iterator, List, Optional, Sequence, Union
from warnings import warn

import biocutils as ut
import numpy as np

from .partition import Partitioning

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def _validate_data_and_partitions(unlist_data, partition):
    if len(unlist_data) != partition.nobj():
        raise ValueError(
            f"Length of 'unlist_data' ({len(unlist_data)}) "
            f"doesn't match 'partitioning' total length ({partition.nobj()})."
        )


[docs] class CompressedList: """Base class for compressed list objects. `CompressedList` stores list elements concatenated in a single vector-like object with partitioning information that defines where each list element starts and ends. """
[docs] def __init__( self, unlist_data: Any, partitioning: Partitioning, element_type: Any = None, element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, validate: bool = True, ): """Initialize a CompressedList. Args: unlist_data: Vector-like object containing concatenated elements. partitioning: Partitioning object defining element boundaries (exclusive). element_type: class for the type of elements. element_metadata: Optional metadata for elements. metadata: Optional general metadata. validate: Internal use only. """ self._unlist_data = unlist_data self._partitioning = partitioning self._element_type = element_type self._element_metadata = element_metadata or {} self._metadata = metadata or {} if validate: _validate_data_and_partitions(self._unlist_data, self._partitioning)
def _define_output(self, in_place: bool = False) -> "CompressedList": if in_place is True: return self else: return self.__copy__() ######################### ######>> Copying <<###### #########################
[docs] def __deepcopy__(self, memo=None, _nil=[]): """ Returns: A deep copy of the current ``Partitioning``. """ from copy import deepcopy _unlistdata_copy = deepcopy(self._unlist_data) _part_copy = deepcopy(self._partitioning) _elem_type_copy = deepcopy(self._element_type) _elem_metadata_copy = deepcopy(self._element_metadata) _metadata_copy = deepcopy(self._metadata) current_class_const = type(self) return current_class_const( unlist_data=_unlistdata_copy, partitioning=_part_copy, element_type=_elem_type_copy, element_metadata=_elem_metadata_copy, metadata=_metadata_copy, )
[docs] def __copy__(self): """ Returns: A shallow copy of the current ``Partitioning``. """ current_class_const = type(self) return current_class_const( unlist_data=self._unlist_data, partitioning=self._partitioning, element_type=self._element_type, element_metadata=self._element_metadata, metadata=self._metadata, )
[docs] def copy(self): """Alias for :py:meth:`~__copy__`.""" return self.__copy__()
###################################### ######>> length and iterators <<###### ######################################
[docs] def __len__(self) -> int: """Return the number of list elements.""" return len(self._partitioning)
[docs] def get_element_lengths(self) -> np.ndarray: """Get the lengths of each list element.""" return self._partitioning.get_element_lengths()
[docs] def __iter__(self) -> Iterator[Any]: """Iterate over list elements.""" for i in range(len(self)): yield self[i]
########################## ######>> Printing <<###### ##########################
[docs] def __repr__(self) -> str: """ Returns: A string representation. """ output = f"{type(self).__name__}(number_of_elements={len(self)}" output += ", unlist_data=" + ut.print_truncated_list(self._unlist_data) output += ", partitioning=" + self._partitioning.__repr__() output += ", element_type=" + self._element_type if len(self._element_metadata) > 0: output += ", element_metadata=" + ut.print_truncated_dict(self._element_metadata) if len(self._metadata) > 0: output += ", metadata=" + ut.print_truncated_dict(self._metadata) output += ")" return output
def __str__(self) -> str: """ Returns: A pretty-printed string containing the contents of this object. """ output = f"class: {type(self).__name__}\n" output += f"number of elements: ({len(self)}) of type: {self._element_type}\n" output += f"unlist_data: {ut.print_truncated_list(self._unlist_data)}\n" output += f"partitioning: {ut.print_truncated_list(self._partitioning)}\n" output += f"element_metadata({str(len(self._element_metadata))}): {ut.print_truncated_list(list(self._element_metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" output += f"metadata({str(len(self._metadata))}): {ut.print_truncated_list(list(self._metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" return output ############################# ######>> element_type <<##### #############################
[docs] def get_element_type(self) -> str: """Return the element_type.""" return self._element_type
@property def element_type(self) -> str: """Alias for :py:attr:`~get_element_type`, provided for back-compatibility.""" return self.get_element_type() ########################### ######>> partitions <<##### ###########################
[docs] def get_partitioning(self) -> Partitioning: """Return the paritioning info.""" return self._partitioning
@property def paritioning(self) -> Partitioning: """Alias for :py:attr:`~get_paritioning`, provided for back-compatibility.""" return self.get_partitioning() ####################### ######>> names <<###### #######################
[docs] def get_names(self) -> Optional[ut.NamedList]: """Get the names of list elements.""" return self._partitioning.get_names()
[docs] def set_names(self, names: List[str], in_place: bool = False) -> "CompressedList": """Set the names of list elements. names: New names, same as the number of rows. May be `None` to remove names. in_place: Whether to modify the ``CompressedList`` in place. Returns: A modified ``CompressedList`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ output = self._define_output(in_place) output._partitioning = self._partitioning.set_names(names, in_place=False) return output
@property def names(self) -> Optional[ut.NamedList]: """Alias for :py:attr:`~get_names`.""" return self._partitioning.get_names() @names.setter def names(self, names: Sequence[str]): """Alias for :py:attr:`~set_names` with ``in_place = True``. As this mutates the original object, a warning is raised. """ warn( "Setting property 'names' is an in-place operation, use 'set_names' instead", UserWarning, ) self.set_names(names=names, in_place=True) ############################# ######>> unlist_data <<###### #############################
[docs] def get_unlist_data(self) -> Any: """Get all elements.""" return self._unlist_data
[docs] def set_unlist_data(self, unlist_data: Any, in_place: bool = False) -> "CompressedList": """Set new list elements. Args: unlist_data: New vector-like object containing concatenated elements. in_place: Whether to modify the ``CompressedList`` in place. Returns: A modified ``CompressedList`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ output = self._define_output(in_place) _validate_data_and_partitions(unlist_data=unlist_data, partition=self._partitioning) output._unlist_data = unlist_data return output
@property def unlist_data(self) -> Any: """Alias for :py:attr:`~get_unlist_data`.""" return self.get_unlist_data() @unlist_data.setter def unlist_data(self, unlist_data: Any): """Alias for :py:attr:`~set_unlist_data` with ``in_place = True``. As this mutates the original object, a warning is raised. """ warn( "Setting property 'unlist_data' is an in-place operation, use 'set_unlist_data' instead", UserWarning, ) self.set_unlist_data(unlist_data, in_place=True) ################################### ######>> element metadata <<####### ###################################
[docs] def get_element_metadata(self) -> dict: """ Returns: Dictionary of metadata for each element in this object. """ return self._element_metadata
[docs] def set_element_metadata(self, element_metadata: dict, in_place: bool = False) -> "CompressedList": """Set new element metadata. Args: element_metadata: New element metadata for this object. in_place: Whether to modify the ``CompressedList`` in place. Returns: A modified ``CompressedList`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ if not isinstance(element_metadata, dict): raise TypeError(f"`element_metadata` must be a dictionary, provided {type(element_metadata)}.") output = self._define_output(in_place) output._element_metadata = element_metadata return output
@property def element_metadata(self) -> dict: """Alias for :py:attr:`~get_element_metadata`.""" return self.get_element_metadata() @element_metadata.setter def element_metadata(self, element_metadata: dict): """Alias for :py:attr:`~set_element_metadata` with ``in_place = True``. As this mutates the original object, a warning is raised. """ warn( "Setting property 'element_metadata' is an in-place operation, use 'set_element_metadata' instead", UserWarning, ) self.set_element_metadata(element_metadata, in_place=True) ########################### ######>> metadata <<####### ###########################
[docs] def get_metadata(self) -> dict: """ Returns: Dictionary of metadata for this object. """ return self._metadata
[docs] def set_metadata(self, metadata: dict, in_place: bool = False) -> "CompressedList": """Set additional metadata. Args: metadata: New metadata for this object. in_place: Whether to modify the ``CompressedList`` in place. Returns: A modified ``CompressedList`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ if not isinstance(metadata, dict): raise TypeError(f"`metadata` must be a dictionary, provided {type(metadata)}.") output = self._define_output(in_place) output._metadata = metadata return output
@property def metadata(self) -> dict: """Alias for :py:attr:`~get_metadata`.""" return self.get_metadata() @metadata.setter def metadata(self, metadata: dict): """Alias for :py:attr:`~set_metadata` with ``in_place = True``. As this mutates the original object, a warning is raised. """ warn( "Setting property 'metadata' is an in-place operation, use 'set_metadata' instead", UserWarning, ) self.set_metadata(metadata, in_place=True) ########################## ######>> accessors <<##### ##########################
[docs] def __getitem__(self, key: Union[int, str, slice]) -> Any: """Get an element or slice of elements from the list. Args: key: Integer index, string name, or slice. Returns: List element(s). """ # string keys (names) if isinstance(key, str): if key not in list(self.get_names()): raise KeyError(f"No element named '{key}'.") key = list(self.names).index(key) # integer indices if isinstance(key, int): if key < 0: key += len(self) if key < 0 or key >= len(self): raise IndexError(f"List index '{key}' out of range.") start, end = self._partitioning.get_partition_range(key) return self.extract_range(start, end) # slices elif isinstance(key, slice): indices = range(*key.indices(len(self))) result = [] for i in indices: start, end = self._partitioning.get_partition_range(i) result.append(self.extract_range(start, end)) current_class_const = type(self) return current_class_const.from_list( result, names=[self.names[i] for i in indices] if self.names[0] is not None else None ) else: raise TypeError("'key' must be int, str, or slice.")
################################## ######>> abstract methods <<###### ##################################
[docs] def extract_range(self, start: int, end: int) -> Any: """Extract a range from `unlist_data`. This method must be implemented by subclasses to handle type-specific extraction from `unlist_data`. Args: start: Start index (inclusive). end: End index (exclusive). Returns: Extracted element. """ try: return self._unlist_data[start:end] except Exception as e: raise NotImplementedError( "Custom classes should implement their own `extract_range` method for slice operations" ) from e
[docs] @classmethod def from_list( cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None ) -> "CompressedList": """Create a CompressedList from a regular list. This method must be implemented by subclasses to handle type-specific conversion from list to unlist_data. Args: lst: List to convert. names: Optional names for list elements. metadata: Optional metadata. Returns: A new `CompressedList`. """ # Flatten the list flat_data = [] for sublist in lst: flat_data.extend(sublist) # Create partitioning partitioning = Partitioning.from_list(lst, names) # Create unlist_data # unlist_data = cls._element_type(data=flat_data) return cls(flat_data, partitioning, metadata=metadata)
########################### ######>> coercions <<###### ###########################
[docs] def to_list(self) -> List[Any]: """Convert to a regular Python list. Returns: A regular Python list with all elements. """ return list(self)
[docs] def unlist(self, use_names: bool = True) -> Any: """Get the underlying unlisted data. Args: use_names: Whether to include names in the result if applicable. Currently not used. Returns: The unlisted data. """ return self._unlist_data
[docs] def relist(self, unlist_data: Any) -> "CompressedList": """Create a new `CompressedList` with the same partitioning but different data. Args: unlist_data: New unlisted data. Returns: A new CompressedList. """ _validate_data_and_partitions(unlist_data, self._partitioning) current_class_const = type(self) return current_class_const( unlist_data, self._partitioning.copy(), element_type=self._element_type, element_metadata=self._element_metadata.copy(), metadata=self._metadata.copy(), )
[docs] def extract_subset(self, indices: Sequence[int]) -> "CompressedList": """Extract a subset of elements by indices. Args: indices: Sequence of indices to extract. Returns: A new CompressedList with only the selected elements. """ # Validate indices for i in indices: if i < 0 or i >= len(self): raise IndexError(f"Index {i} out of range") # Extract element lengths and names new_lengths = ut.subset_sequence(self.get_element_lengths(), indices) new_names = ut.subset_sequence(self.names, indices) if self.names is not None else None # Create new partitioning new_partitioning = Partitioning.from_lengths(new_lengths, new_names) # Extract data new_data = [] for i in indices: start, end = self._partitioning.get_partition_range(i) if isinstance(self._unlist_data, np.ndarray): new_data.append(self._unlist_data[start:end]) else: new_data.extend(self._unlist_data[start:end]) if isinstance(self._unlist_data, np.ndarray): new_data = np.concatenate(new_data) current_class_const = type(self) return current_class_const( new_data, new_partitioning, element_type=self._element_type, element_metadata={k: v for k, v in self._element_metadata.items() if k in indices}, metadata=self._metadata.copy(), )
[docs] def lapply(self, func: Callable) -> "CompressedList": """Apply a function to each element. Args: func: Function to apply to each element. Returns: A new CompressedList with the results. """ result = [func(elem) for elem in self] current_class_const = type(self) return current_class_const.from_list(result, self.names, self._metadata)