Source code for singlecellexperiment.io.tenx

import os
from warnings import warn

from biocframe import BiocFrame, from_pandas

from ..SingleCellExperiment import SingleCellExperiment

__author__ = "jkanche, keviny2"
__copyright__ = "jkanche, keviny2"
__license__ = "MIT"



[docs]
def read_tenx_mtx(path: str) -> SingleCellExperiment:
    """Read 10X Matrix market directory as :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`.

    Args:
        path:
            Path to 10X MTX directory.

            Directory must contain `matrix.mtx`, and optionally
            `genes.tsv` (CellRanger version 2) or
            `features.tsv` (CellRanger version 3 and later) to
            represent features and `barcodes.tsv` for cell annotations.

    Returns:
        A single-cell experiment object.
    """

    import pandas as pd
    from scipy.io import mmread
    from scipy.sparse import csr_matrix

    mat = mmread(f"{path}/matrix.mtx")
    mat = csr_matrix(mat)

    features_path = path + "/features.tsv"
    genes_path = path + "/genes.tsv"
    if os.path.exists(features_path) and os.path.exists(genes_path):
        warn(
            "Both 'features.tsv' and 'genes.tsv' files are present in the directory. "
            "Prioritizing 'features.tsv' for processing."
        )

    if os.path.exists(features_path):
        genes = pd.read_csv(features_path, header=None, sep="\t", usecols=[0, 1])
    else:
        genes = pd.read_csv(genes_path, header=None, sep="\t")
    genes.columns = ["gene_ids", "gene_symbols"]

    cells = pd.read_csv(path + "/barcodes.tsv", header=None, sep="\t")
    cells.columns = ["barcode"]

    return SingleCellExperiment(
        assays={"counts": mat},
        row_data=from_pandas(genes),
        column_data=from_pandas(cells),
    )




[docs]
def read_tenx_h5(path: str, realize_assays: bool = False) -> SingleCellExperiment:
    """Read 10X H5 file as :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`.

    Note: Currently only supports version 3 of the 10X H5 format.

    Args:
        path:
            Path to 10x H5 file.

        realize_assays:
            Whether to realize assays into memory.
            Defaults to False.

    Returns:
        A single-cell experiment object.
    """

    import delayedarray as da
    import h5py
    from hdf5array import Hdf5CompressedSparseMatrix

    h5 = h5py.File(path, mode="r")

    if "matrix" not in h5.keys():
        raise ValueError(f"H5 file ({path}) is not a 10X V3 format.")

    groups = h5["matrix"].keys()

    # read the matrix
    shape = tuple(h5["matrix"]["shape"][:])

    counts = Hdf5CompressedSparseMatrix(path=path, group_name="matrix", by_column=True, shape=shape)

    if realize_assays is True:
        counts = da.to_scipy_sparse_matrix(counts, "csr")

    # read features
    features = None
    ignore_list = []
    if "features" in groups:
        features = {}
        for key, val in h5["matrix"]["features"].items():
            temp_features = [x.decode("ascii") for x in val]

            if len(temp_features) != counts.shape[0]:
                ignore_list.append(key)
            else:
                features[key] = temp_features

        features = BiocFrame(features, number_of_rows=counts.shape[0])

    if len(ignore_list) > 0:
        warn(
            f"These columns from h5 are ignored - {', '.join(ignore_list)} because of "
            "inconsistent length with the count matrix."
        )

    barcodes = None
    if "barcodes" in groups:
        barcodes = {}
        barcodes["barcodes"] = [x.decode("ascii") for x in h5["matrix"]["barcodes"]]
        barcodes = BiocFrame(barcodes, number_of_rows=counts.shape[1])

    h5.close()

    return SingleCellExperiment(assays={"counts": counts}, row_data=features, column_data=barcodes)