Source code for scranpy.feature_selection.model_gene_variances

from dataclasses import dataclass
from typing import Optional, Sequence, Union

from biocframe import BiocFrame
from numpy import float64, uintp, zeros

from .. import _cpphelpers as lib
from .._utils import MatrixTypes, factorize, tatamize_input

__author__ = "ltla, jkanche"
__copyright__ = "ltla, jkanche"
__license__ = "MIT"


[docs]@dataclass
class ModelGeneVariancesOptions:
    """Optional arguments for :py:meth:`~scranpy.feature_selection.model_gene_variances.model_gene_variances`.

    Attributes:
        block:
            Block assignment for each cell.
            Variance modelling is performed within each block to avoid interference from inter-block differences.

            If provided, this should have length equal to the number of cells, where cells have the same value if and
            only if they are in the same block. Defaults to None, indicating all cells are part of the same block.

        span:
            Span to use for the LOWESS trend fitting.
            Larger values yield a smoother curve and reduces the risk of overfitting,
            at the cost of being less responsive to local variations.
            Defaults to 0.3.

        assay_type:
            Assay to use from ``input`` if it is a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        feature_names:
            Sequence of feature names of length equal to the number of rows in ``input``.
            If provided, this is used as the row names of the output data frames.

        num_threads:
            Number of threads to use. Defaults to 1.
    """

    block: Optional[Sequence] = None
    span: float = 0.3
    assay_type: Union[int, str] = "logcounts"
    feature_names: Optional[Sequence[str]] = None
    num_threads: int = 1


[docs]def model_gene_variances(
    input: MatrixTypes, options: ModelGeneVariancesOptions = ModelGeneVariancesOptions()
) -> BiocFrame:
    """Compute gene variances and model them with a trend to account for
    non-trivial mean-variance relationships in count data. The residual from
    the trend can then be used to identify highly variable genes, e.g., with
    :py:meth:`~scranpy.feature_selection.choose_hvgs.choose_hvgs`.

    Args:
        input:
            Matrix-like object where rows are features and columns are cells,
            typically containing log-normalized expression values from
            :py:meth:`~scranpy.normalization.log_norm_counts.log_norm_counts`.
            This should be a matrix class that can be converted into a
            :py:class:`~mattress.TatamiNumericPointer`.

            Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing such a matrix in its assays.

            Developers may also provide a :py:class:`~mattress.TatamiNumericPointer` directly.

        options:
            Optional parameters.

    Returns:
        Data frame with variance modelling results for each gene, specifically
        the mean log-expression, the variance, the fitted value of the
        mean-variance trend and the residual from the trend. Each row
        of the data frame corresponds to a row of ``input``.

        For multiple blocks, the data frame's columns will represent the average
        across blocks. An extra ``per_block`` column will also be present
        containing a nested :py:class:`~biocframe.BiocFrame.BiocFrame`
        with the same per-block statistics.
    """
    x = tatamize_input(input, options.assay_type)

    NR = x.nrow()
    means = zeros((NR,), dtype=float64)
    variances = zeros((NR,), dtype=float64)
    fitted = zeros((NR,), dtype=float64)
    residuals = zeros((NR,), dtype=float64)
    extra = None

    if options.block is None:
        lib.model_gene_variances(
            x.ptr,
            means,
            variances,
            fitted,
            residuals,
            options.span,
            options.num_threads,
        )

        return BiocFrame(
            {
                "means": means,
                "variances": variances,
                "fitted": fitted,
                "residuals": residuals,
            },
            number_of_rows=NR,
            row_names=options.feature_names,
        )
    else:
        NC = x.ncol()
        if len(options.block) != NC:
            raise ValueError(
                f"Must provide block assignments (provided: {len(options.block)})"
                f" for all cells (expected: {NC})."
            )

        block_levels, block_indices = factorize(options.block)
        nlevels = len(block_levels)

        all_means = []
        all_variances = []
        all_fitted = []
        all_residuals = []
        all_means_ptr = zeros((nlevels,), dtype=uintp)
        all_variances_ptr = zeros((nlevels,), dtype=uintp)
        all_fitted_ptr = zeros((nlevels,), dtype=uintp)
        all_residuals_ptr = zeros((nlevels,), dtype=uintp)

        for lvl in range(nlevels):
            cur_means = zeros((NR,), dtype=float64)
            cur_variances = zeros((NR,), dtype=float64)
            cur_fitted = zeros((NR,), dtype=float64)
            cur_residuals = zeros((NR,), dtype=float64)

            all_means_ptr[lvl] = cur_means.ctypes.data
            all_variances_ptr[lvl] = cur_variances.ctypes.data
            all_fitted_ptr[lvl] = cur_fitted.ctypes.data
            all_residuals_ptr[lvl] = cur_residuals.ctypes.data

            all_means.append(cur_means)
            all_variances.append(cur_variances)
            all_fitted.append(cur_fitted)
            all_residuals.append(cur_residuals)

        lib.model_gene_variances_blocked(
            x.ptr,
            means,
            variances,
            fitted,
            residuals,
            nlevels,
            block_indices.ctypes.data,
            all_means_ptr.ctypes.data,
            all_variances_ptr.ctypes.data,
            all_fitted_ptr.ctypes.data,
            all_residuals_ptr.ctypes.data,
            options.span,
            options.num_threads,
        )

        extra = {}
        for i in range(nlevels):
            extra[block_levels[i]] = BiocFrame(
                {
                    "means": all_means[i],
                    "variances": all_variances[i],
                    "fitted": all_fitted[i],
                    "residuals": all_residuals[i],
                },
                row_names=options.feature_names,
            )

        return BiocFrame(
            {
                "means": means,
                "variances": variances,
                "fitted": fitted,
                "residuals": residuals,
                "per_block": BiocFrame(extra),
            },
            number_of_rows=NR,
            row_names=options.feature_names,
        )