Source code for delayedarray.chunk_grid

from functools import singledispatch
from typing import Any, Tuple, Sequence
from numpy import ndarray
from biocutils.package_utils import is_package_installed

from .SparseNdarray import SparseNdarray
from .RegularTicks import RegularTicks
from .Grid import SimpleGrid, AbstractGrid

__author__ = "ltla"
__copyright__ = "ltla"
__license__ = "MIT"



[docs]
def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: float) -> SimpleGrid:
    """
    Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`.
    This assumes that the underlying array is split up into regular intervals
    on each dimension; the first chunk should start from zero, and only the
    last chunk may be of a different size (bounded by the dimension extent).

    Args:
        chunks:
            Chunk size for each dimension. These should be positive.

        shape:
            Extent of each dimension of the array. These should be non-negative
            and of the same length as ``chunks``.

        cost_factor:
            Cost factor for iterating over each element of the associated
            array. This is used to decide between iteration schemes and can be
            increased for more expensive types, e.g., file-backed arrays. As a
            reference, in-memory NumPy arrays are assigned a cost factor of 1.
            Non-integer values are accepted, e.g., the sparse methods in this
            module pass 1.5.

    Returns:
        A ``SimpleGrid`` object with the chunk shape as the boundaries.
    """
    boundaries = []
    for dim, chunk_size in enumerate(chunks):
        extent = shape[dim]
        if extent == 0:
            # Nothing to iterate over on an empty dimension.
            boundaries.append([])
        elif chunk_size == extent:
            # One chunk covers the whole dimension, so a single boundary at
            # the extent is enough.
            boundaries.append([extent])
        else:
            # Otherwise, delegate to RegularTicks with the chunk size and
            # the dimension extent.
            boundaries.append(RegularTicks(chunk_size, extent))
    return SimpleGrid((*boundaries,), cost_factor=cost_factor)




[docs]
@singledispatch
def chunk_grid(x: Any) -> AbstractGrid:
    """
    Create a grid over the array, which tells a caller how it should iterate
    over that array. The intervals of the grid usually reflect a particular
    layout of the data on disk or in memory.

    This is the generic fallback of a single-dispatch function; it is only
    reached when no method has been registered for the type of ``x``.

    Args:
        x: An array-like object.

    Returns:
        An instance of a :py:class:`~delayedarray.Grid.AbstractGrid`.

    Raises:
        NotImplementedError: Always, as the fallback has no implementation.
    """
    message = f"'chunk_grid({type(x)})' has not yet been implemented"
    raise NotImplementedError(message)




[docs]
@chunk_grid.register
def chunk_grid_ndarray(x: ndarray) -> SimpleGrid:
    """
    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

    Each chunk has size 1 on every dimension except one, which spans its full
    extent: the first dimension for Fortran-contiguous arrays, and the last
    dimension otherwise.

    The cost factor for iteration is set to 1, which is considered the lowest
    cost for data extraction given that everything is stored in memory.
    """
    dims = x.shape
    # Anything that is not flagged as F-contiguous is treated as if it were
    # C-contiguous. Strided views are not handled specially, as it is unclear
    # how to detect them from the NumPy flags.
    full_axis = 0 if x.flags.f_contiguous else -1
    chunk_sizes = [1] * len(dims)
    chunk_sizes[full_axis] = dims[full_axis]
    return chunk_shape_to_grid(chunk_sizes, dims, cost_factor=1)




[docs]
@chunk_grid.register
def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid:
    """
    See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

    Each chunk spans the full extent of the first dimension and has size 1 on
    every other dimension.

    The cost factor for iteration is set to 1.5. This is slightly higher than
    that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive
    for random access on the first dimension.
    """
    dims = x.shape
    chunk_sizes = [dims[0]] + [1] * (len(dims) - 1)
    return chunk_shape_to_grid(chunk_sizes, dims, cost_factor=1.5)



# If scipy is installed, we add all the methods for the various scipy.sparse matrices.

if is_package_installed("scipy"):
    import scipy.sparse as sp



[docs]
    @chunk_grid.register
    def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        Each chunk is a single column, i.e., it spans the full extent of the
        first dimension and has size 1 on the second.

        The cost factor for iteration is set to 1.5. This is slightly higher
        than that of dense NumPy arrays as CSC matrices are a bit more
        expensive for random row access.
        """
        dims = x.shape
        column_chunk = (dims[0], 1)
        return chunk_shape_to_grid(column_chunk, dims, cost_factor=1.5)




[docs]
    @chunk_grid.register
    def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        Each chunk is a single row, i.e., it has size 1 on the first dimension
        and spans the full extent of the second.

        The cost factor for iteration is set to 1.5. This is slightly higher
        than that of dense NumPy arrays as CSR matrices are a bit more
        expensive for random column access.
        """
        dims = x.shape
        row_chunk = (1, dims[1])
        return chunk_shape_to_grid(row_chunk, dims, cost_factor=1.5)




[docs]
    @chunk_grid.register
    def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid:
        """
        See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

        The chunk shape is set to the shape of the matrix itself.

        The cost factor for iteration is set to 5, as any extraction from a COO
        matrix requires a full scan through all elements.
        """
        # There is no nice way to access a COO matrix, so this is a best
        # effort: the whole matrix is used as the chunk shape.
        dims = x.shape
        return chunk_shape_to_grid(dims, dims, cost_factor=5)