Source code for delayedarray.chunk_grid

from functools import singledispatch
from typing import Any, Tuple, Sequence
from numpy import ndarray
from biocutils.package_utils import is_package_installed

from .SparseNdarray import SparseNdarray
from .RegularTicks import RegularTicks
from .Grid import SimpleGrid, AbstractGrid

# Module-level metadata: author, copyright holder and license identifier.
__author__ = "ltla"
__copyright__ = "ltla"
__license__ = "MIT"


def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: float) -> SimpleGrid:
    """
    Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`.
    This assumes that the underlying array is split up into regular intervals
    on each dimension; the first chunk should start from zero, and only the
    last chunk may be of a different size (bounded by the dimension extent).

    Args:
        chunks:
            Chunk size for each dimension. These should be positive.

        shape:
            Extent of each dimension of the array. These should be
            non-negative and of the same length as ``chunks``.

        cost_factor:
            Cost factor for iterating over each element of the associated
            array. This is used to decide between iteration schemes and can be
            increased for more expensive types, e.g., file-backed arrays. As a
            reference, in-memory NumPy arrays are assigned a cost factor of 1.
            Fractional values (e.g., 1.5) are accepted.

    Returns:
        A ``SimpleGrid`` object with the chunk shape as the boundaries.

    Raises:
        ValueError: If ``chunks`` and ``shape`` have different lengths.
    """
    # Fail loudly on mismatched inputs; otherwise a short 'chunks' would
    # silently produce a grid with fewer dimensions than the array.
    if len(chunks) != len(shape):
        raise ValueError(
            "'chunks' and 'shape' should have the same length "
            f"(got {len(chunks)} and {len(shape)})"
        )

    boundaries = []
    for ch, sh in zip(chunks, shape):
        if sh == 0:
            # Zero-extent dimension: there are no chunk boundaries at all.
            boundaries.append([])
        elif ch == sh:
            # A single chunk spans the whole dimension.
            boundaries.append([sh])
        else:
            boundaries.append(RegularTicks(ch, sh))

    return SimpleGrid(tuple(boundaries), cost_factor=cost_factor)
@singledispatch
def chunk_grid(x: Any) -> AbstractGrid:
    """
    Create a grid over the array, used to determine how a caller should
    iterate over that array. The intervals of the grid usually reflect a
    particular layout of the data on disk or in memory.

    This is the generic fallback of a single-dispatch function; it is only
    reached when no implementation has been registered for the type of ``x``.

    Args:
        x: An array-like object.

    Returns:
        An instance of a :py:class:`~delayedarray.Grid.AbstractGrid`.

    Raises:
        NotImplementedError: Always, as the fallback has no way to build a grid.
    """
    message = f"'chunk_grid({type(x)!s})' has not yet been implemented"
    raise NotImplementedError(message)
@chunk_grid.register
def chunk_grid_ndarray(x: ndarray) -> SimpleGrid:
    """
    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

    Each chunk spans the full extent of one dimension and a single element of
    every other dimension: the first dimension for Fortran-contiguous arrays,
    the last dimension otherwise.

    The cost factor for iteration is set to 1, which is considered the lowest
    cost for data extraction given that everything is stored in memory.
    """
    # Strided views cannot easily be told apart from the NumPy flags, so
    # anything that is not Fortran-contiguous is treated as C-contiguous,
    # given that most arrays are.
    full_axis = 0 if x.flags.f_contiguous else -1

    # NOTE(review): for a zero-dimensional array the shape is empty and the
    # indexing below raises IndexError - confirm whether 0-d input is expected.
    chunk_dims = [1] * len(x.shape)
    chunk_dims[full_axis] = x.shape[full_axis]

    return chunk_shape_to_grid(chunk_dims, x.shape, cost_factor=1)
@chunk_grid.register
def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid:
    """
    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

    Each chunk spans the full extent of the first dimension and a single
    element of every other dimension.

    The cost factor for iteration is set to 1.5. This is slightly higher than
    that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive
    for random access on the first dimension.
    """
    dims = x.shape
    # Whole first dimension, then unit-length chunks on all remaining ones.
    chunk_dims = [dims[0]] + [1] * (len(dims) - 1)
    return chunk_shape_to_grid(chunk_dims, dims, cost_factor=1.5)
# The scipy.sparse methods are only registered when scipy is available, so
# that scipy remains an optional dependency.
if is_package_installed("scipy"):
    import scipy.sparse as sp

    @chunk_grid.register
    def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        Each chunk is a single full-length column. The cost factor for
        iteration is set to 1.5. This is slightly higher than that of dense
        NumPy arrays as CSC matrices are a bit more expensive for random row
        access.
        """
        dims = x.shape
        column_chunk = (dims[0], 1)
        return chunk_shape_to_grid(column_chunk, dims, cost_factor=1.5)

    @chunk_grid.register
    def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        Each chunk is a single full-length row. The cost factor for iteration
        is set to 1.5. This is slightly higher than that of dense NumPy arrays
        as CSR matrices are a bit more expensive for random column access.
        """
        dims = x.shape
        row_chunk = (1, dims[1])
        return chunk_shape_to_grid(row_chunk, dims, cost_factor=1.5)

    @chunk_grid.register
    def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        The whole matrix is treated as one chunk. The cost factor for
        iteration is set to 5, as any extraction from a COO matrix requires a
        full scan through all elements.
        """
        # There is no nice way to access a COO matrix, so this is a
        # best-effort choice: one chunk covering the full extent.
        full_extent = x.shape
        return chunk_shape_to_grid(full_extent, full_extent, cost_factor=5)