Source code for hdf5array.Hdf5CompressedSparseMatrixSeed

from bisect import bisect_left
from typing import Callable, Literal, Optional, Sequence, Tuple

import numpy
from biocutils.package_utils import is_package_installed
from delayedarray import (
    DelayedArray,
    SparseNdarray,
    chunk_grid,
    chunk_shape_to_grid,
    extract_dense_array,
    extract_sparse_array,
    is_masked,
    is_sparse,
    wrap,
)
from h5py import File
from numpy import asarray, dtype, integer, issubdtype, zeros

# Module-level authorship and licensing metadata.
__author__ = "LTLA"
__copyright__ = "LTLA"
__license__ = "MIT"


class Hdf5CompressedSparseMatrixSeed:
    """Compressed sparse matrix stored in a HDF5 file, represented as a
    ``DelayedArray`` seed.

    This assumes that there are three datasets; ``data`` containing the data
    values, ``indices`` containing the indices, and ``indptr`` containing the
    pointers to the start of every row/column.
    """

    def __init__(
        self,
        path: str,
        group_name: Optional[str],
        shape: Tuple[int, int],
        by_column: bool,
        dtype: Optional[numpy.dtype] = None,
        index_dtype: Optional[numpy.dtype] = None,
        data_name: Optional[str] = None,
        indices_name: Optional[str] = None,
        indptr_name: Optional[str] = None,
    ):
        """
        Args:
            path:
                Path to the HDF5 file.

            group_name:
                Name of the group containing the sparse matrix's contents.
                This can also be None in which case ``data_name``,
                ``indices_name`` and ``indptr_name`` should be specified.

            shape:
                Tuple of length 2 specifying the shape of the matrix.

            by_column:
                Whether this is a compressed sparse column matrix. If False,
                the data is treated as a compressed sparse row matrix.

            dtype:
                NumPy type of the data. Defaults to the HDF5 type on disk;
                otherwise, values are transformed to ``dtype`` during
                extraction.

            index_dtype:
                NumPy type of the indices. Defaults to the HDF5 type on disk;
                otherwise, values are transformed to ``index_dtype`` during
                extraction.

            data_name:
                Name of the dataset containing the data values. Defaults to
                ``group_name`` plus ``/data``.

            indices_name:
                Name of the dataset containing the indices. Defaults to
                ``group_name`` plus ``/indices``.

            indptr_name:
                Name of the dataset containing the pointers. Defaults to
                ``group_name`` plus ``/indptr``.

        Raises:
            ValueError:
                If a dataset name cannot be determined because both it and
                ``group_name`` are None, or if the ``indptr``, ``data`` or
                ``indices`` datasets are inconsistent with each other or with
                ``shape``.
        """
        self._path = path
        self._group_name = group_name
        self._shape = shape
        self._by_column = by_column

        self._data_name = self._resolve_dataset_name(group_name, data_name, "data")
        self._indices_name = self._resolve_dataset_name(group_name, indices_name, "indices")
        self._indptr_name = self._resolve_dataset_name(group_name, indptr_name, "indptr")

        with File(self._path, "r") as handle:
            # The pointers are loaded eagerly and kept on the seed; the data
            # and indices datasets are only inspected for shape and type here.
            self._indptr = handle[self._indptr_name][:]
            problem = self._indptr_problem(self._indptr, shape, by_column)
            if problem is not None:
                raise ValueError(problem)

            ddset = handle[self._data_name]
            if len(ddset.shape) != 1 or ddset.shape[0] != self._indptr[-1]:
                raise ValueError("'data' dataset should have length equal to the number of non-zero elements")

            # Remember whether the requested type differs from the on-disk type.
            self._modify_dtype = dtype is not None and dtype != ddset.dtype
            if not self._modify_dtype:
                dtype = ddset.dtype
            self._dtype = dtype

            # Not going to check for consistency of the indices themselves.
            idset = handle[self._indices_name]
            if len(idset.shape) != 1 or idset.shape[0] != self._indptr[-1]:
                raise ValueError("'indices' dataset should have length equal to the number of non-zero elements")
            if not issubdtype(idset.dtype, integer):
                raise ValueError("'indices' dataset should contain integers")

            self._modify_index_dtype = index_dtype is not None and index_dtype != idset.dtype
            if not self._modify_index_dtype:
                index_dtype = idset.dtype
            self._index_dtype = index_dtype

    @staticmethod
    def _resolve_dataset_name(group_name: Optional[str], name: Optional[str], suffix: str) -> str:
        """Return ``name`` if given, otherwise ``group_name`` plus ``/suffix``.

        Raises:
            ValueError: If both ``name`` and ``group_name`` are None.
        """
        if name is not None:
            return name
        if group_name is None:
            raise ValueError(f"'{suffix}_name' must be specified when 'group_name' is None")
        return group_name + "/" + suffix

    @staticmethod
    def _indptr_problem(indptr, shape: Tuple[int, int], by_column: bool) -> Optional[str]:
        """Describe the first inconsistency found in ``indptr``.

        Args:
            indptr: NumPy array of pointers read from the file.
            shape: Expected shape of the matrix.
            by_column: Whether the pointers delimit columns (True) or rows.

        Returns:
            An error message, or None if ``indptr`` passes all checks.
        """
        if len(indptr.shape) != 1 or not issubdtype(indptr.dtype, integer):
            return "'indptr' dataset should be 1-dimensional and contain integers"

        if by_column:
            if len(indptr) != shape[1] + 1:
                return "'indptr' dataset should have length equal to the number of columns + 1"
        elif len(indptr) != shape[0] + 1:
            return "'indptr' dataset should have length equal to the number of rows + 1"

        if indptr[0] != 0:
            return "first entry of 'indptr' dataset should be zero"

        # Each pointer must be no smaller than its predecessor.
        if (indptr[1:] < indptr[:-1]).any():
            return "entries of 'indptr' should be ordered"

        return None

    @property
    def dtype(self) -> numpy.dtype:
        """
        Returns:
            NumPy type of this matrix.
        """
        return self._dtype

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Returns:
            Tuple containing the dimensions of this matrix.
        """
        return self._shape

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the HDF5 file.
        """
        return self._path

    @property
    def index_dtype(self) -> numpy.dtype:
        """
        Returns:
            NumPy type of the indices in this matrix.
        """
        return self._index_dtype

    @property
    def by_column(self) -> bool:
        """
        Returns:
            Whether the matrix is compressed sparse column.
        """
        return self._by_column

    @property
    def group_name(self) -> Optional[str]:
        """
        Returns:
            Name of the HDF5 group containing the matrix contents, or None if
            the matrix is not contained within a single group.
        """
        return self._group_name

    @property
    def data_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix data values.
        """
        return self._data_name

    @property
    def indices_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix indices.
        """
        return self._indices_name

    @property
    def indptr_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix pointers.
        """
        return self._indptr_name
@is_sparse.register
def is_sparse_Hdf5CompressedSparseMatrixSeed(x: Hdf5CompressedSparseMatrixSeed):
    """See :py:meth:`~delayedarray.is_sparse.is_sparse`.

    Always reports True for this seed type.
    """
    return True
@chunk_grid.register
def chunk_grid_Hdf5CompressedSparseMatrixSeed(x: Hdf5CompressedSparseMatrixSeed):
    """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.

    Each chunk is a single column (for compressed sparse column data) or a
    single row (for compressed sparse row data). The cost factor is set to 20
    to reflect the computational work involved in extracting data from disk.
    """
    # One chunk per primary-dimension element.
    chunks = (x._shape[0], 1) if x._by_column else (1, x._shape[1])
    return chunk_shape_to_grid(chunks, x.shape, cost_factor=20)
def _extract_array( x: Hdf5CompressedSparseMatrixSeed, primary_sub: Sequence[int], secondary_sub: Sequence[int], secondary_len: int, f_individual: Callable, f_consecutive: Callable, ): if len(secondary_sub) == 0: return secondary_start = secondary_sub[0] secondary_end = secondary_sub[-1] + 1 is_consecutive = secondary_end - secondary_start == len(secondary_sub) search_start = secondary_start > 0 search_end = secondary_end < secondary_len with File(x._path, "r") as handle: data = handle[x._data_name] indices = handle[x._indices_name] for i, p in enumerate(primary_sub): start_pos = x._indptr[p] end_pos = x._indptr[p + 1] curdata = data[start_pos:end_pos] curindices = indices[start_pos:end_pos] start_idx = 0 if search_start: start_idx = bisect_left(curindices, secondary_start) end_idx = len(curindices) if search_end: end_idx = bisect_left(curindices, secondary_end, lo=start_idx, hi=end_idx) if is_consecutive: mod_indices = curindices[start_idx:end_idx] if search_start: mod_indices -= secondary_start f_consecutive(i, mod_indices, curdata[start_idx:end_idx]) else: p = 0 for j in range(start_idx, end_idx): curi = curindices[j] while p < len(secondary_sub) and secondary_sub[p] < curi: p += 1 if p == len(secondary_sub): break if secondary_sub[p] == curi: f_individual(i, p, curdata[j]) p += 1
@extract_dense_array.register
def extract_dense_array_Hdf5CompressedSparseMatrixSeed(
    x: Hdf5CompressedSparseMatrixSeed, subset: Tuple[Sequence[int], ...]
) -> numpy.ndarray:
    """See :py:meth:`~delayedarray.extract_dense_array.extract_dense_array`.

    Allocates a zero-filled Fortran-ordered array of the subset's shape and
    fills in the stored entries selected by ``subset``.
    """
    output = zeros((len(subset[0]), len(subset[1])), dtype=x.dtype, order="F")

    # The primary axis is the compressed one: columns for CSC, rows for CSR.
    primary_axis = 1 if x._by_column else 0
    secondary_axis = 1 - primary_axis

    if x._by_column:

        def _set_one(c, r, value):
            output[r, c] = value

        def _set_run(c, rows, values):
            output[rows, c] = values

    else:

        def _set_one(r, c, value):
            output[r, c] = value

        def _set_run(r, cols, values):
            output[r, cols] = values

    _extract_array(
        x=x,
        primary_sub=subset[primary_axis],
        secondary_sub=subset[secondary_axis],
        secondary_len=x.shape[secondary_axis],
        f_individual=_set_one,
        f_consecutive=_set_run,
    )
    return output
@extract_sparse_array.register
def extract_sparse_array_Hdf5CompressedSparseMatrixSeed(
    x: Hdf5CompressedSparseMatrixSeed, subset: Tuple[Sequence[int], ...]
) -> SparseNdarray:
    """See :py:meth:`~delayedarray.extract_sparse_array.extract_sparse_array`.

    Collects, for every requested column, the row positions and values of
    the stored entries selected by ``subset`` and returns them as a
    ``SparseNdarray``.
    """
    # The primary axis is the compressed one: columns for CSC, rows for CSR.
    primary_axis = 1 if x._by_column else 0
    secondary_axis = 1 - primary_axis

    # One (row positions, values) pair per output column.
    columns = [([], []) for _ in range(len(subset[1]))]

    if x._by_column:

        def _add_one(c, r, value):
            columns[c][0].append(r)
            columns[c][1].append(value)

        def _add_run(c, rows, values):
            # A whole column arrives at once, so it replaces the placeholder.
            columns[c] = (rows, values)

    else:

        def _add_one(r, c, value):
            columns[c][0].append(r)
            columns[c][1].append(value)

        def _add_run(r, cols, values):
            # A row arrives at once; scatter its entries across the columns.
            for c, value in zip(cols, values):
                columns[c][0].append(r)
                columns[c][1].append(value)

    _extract_array(
        x=x,
        primary_sub=subset[primary_axis],
        secondary_sub=subset[secondary_axis],
        secondary_len=x.shape[secondary_axis],
        f_individual=_add_one,
        f_consecutive=_add_run,
    )

    # Empty columns become None; a fully empty matrix has contents of None.
    contents = [
        None
        if len(rows) == 0
        else (asarray(rows, dtype=x._index_dtype), asarray(vals, dtype=x._dtype))
        for rows, vals in columns
    ]
    if all(col is None for col in contents):
        contents = None

    return SparseNdarray(
        shape=(len(subset[0]), len(subset[1])),
        contents=contents,
        dtype=x._dtype,
        index_dtype=x._index_dtype,
        check=False,
    )
class Hdf5CompressedSparseMatrix(DelayedArray):
    """Compressed sparse matrix in a HDF5 file as a ``DelayedArray``."""

    def __init__(self, path: str, group_name: Optional[str], shape: Tuple[int, int], by_column: bool, **kwargs):
        """To construct a ``Hdf5CompressedSparseMatrix`` from an existing
        :py:class:`~Hdf5CompressedSparseMatrixSeed`, use
        :py:meth:`~delayedarray.wrap.wrap` instead.

        Args:
            path:
                Path to the HDF5 file. If this is already a
                :py:class:`~Hdf5CompressedSparseMatrixSeed`, it is used
                directly as the seed and all other arguments are ignored.

            group_name:
                Name of the group containing the sparse matrix's contents.

            shape:
                Tuple of length 2 specifying the shape of the matrix.

            by_column:
                Whether this is a compressed sparse column matrix. If False,
                the data is treated as a compressed sparse row matrix.

            kwargs:
                Further arguments to pass to the
                :py:class:`~Hdf5CompressedSparseMatrixSeed` constructor.
        """
        seed = (
            path
            if isinstance(path, Hdf5CompressedSparseMatrixSeed)
            else Hdf5CompressedSparseMatrixSeed(path, group_name, shape, by_column, **kwargs)
        )
        super().__init__(seed)

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the HDF5 file, as reported by the seed.
        """
        return self.seed.path

    @property
    def index_dtype(self) -> dtype:
        """
        Returns:
            NumPy type of the indices in this matrix, as reported by the seed.
        """
        return self.seed.index_dtype

    @property
    def by_column(self) -> bool:
        """
        Returns:
            Whether the matrix is compressed sparse column.
        """
        return self.seed.by_column

    @property
    def group_name(self) -> Optional[str]:
        """
        Returns:
            Name of the HDF5 group containing the matrix contents, or None if
            the matrix is not contained within a single group.
        """
        return self.seed.group_name

    @property
    def data_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix data values.
        """
        return self.seed.data_name

    @property
    def indices_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix indices.
        """
        return self.seed.indices_name

    @property
    def indptr_name(self) -> str:
        """
        Returns:
            Name of the HDF5 dataset containing the matrix pointers.
        """
        return self.seed.indptr_name
@wrap.register
def wrap_Hdf5CompressedSparseMatrixSeed(x: Hdf5CompressedSparseMatrixSeed):
    """See :py:meth:`~delayedarray.wrap.wrap`.

    The seed is passed in place of the path; the remaining constructor
    arguments are unused in that case and are given as None.
    """
    return Hdf5CompressedSparseMatrix(path=x, group_name=None, shape=None, by_column=None)
@is_masked.register
def is_masked_Hdf5CompressedSparseMatrixSeed(x: Hdf5CompressedSparseMatrixSeed) -> bool:
    """See :py:meth:`~delayedarray.is_masked.is_masked`.

    Always reports False for this seed type.
    """
    return False
if is_package_installed("scipy"):
    import scipy.sparse
    from delayedarray.to_scipy_sparse_matrix import to_scipy_sparse_matrix

    @to_scipy_sparse_matrix.register
    def to_scipy_sparse_matrix_from_Hdf5CompressedSparseMatrix(
        x: Hdf5CompressedSparseMatrix, format: Literal["coo", "csr", "csc"] = "csc"
    ) -> scipy.sparse.spmatrix:
        """See :py:func:`delayedarray.to_scipy_sparse_matrix.to_scipy_sparse_matrix`.

        Reads the three datasets in full, builds a SciPy matrix in the
        on-disk layout, and converts it to the requested ``format``. Any
        ``format`` other than ``"csc"`` or ``"csr"`` yields a COO matrix.
        """
        with File(x.path, "r") as handle:
            components = (
                handle[x.data_name][:],
                handle[x.indices_name][:],
                handle[x.indptr_name][:],
            )

        # Build in the layout that matches how the data is stored on disk.
        constructor = scipy.sparse.csc_matrix if x.by_column else scipy.sparse.csr_matrix
        matrix = constructor(components, shape=x.shape, dtype=x.dtype)

        if format == "csc":
            return matrix.tocsc()
        if format == "csr":
            return matrix.tocsr()
        return matrix.tocoo()