from dataclasses import dataclass
from typing import Optional, Sequence, Union
from biocframe import BiocFrame
from numpy import float64, uintp, zeros
from .. import _cpphelpers as lib
from .._utils import MatrixTypes, factorize, tatamize_input
__author__ = "ltla, jkanche"
__copyright__ = "ltla, jkanche"
__license__ = "MIT"
[docs]@dataclass
class ModelGeneVariancesOptions:
"""Optional arguments for :py:meth:`~scranpy.feature_selection.model_gene_variances.model_gene_variances`.
Attributes:
block:
Block assignment for each cell.
Variance modelling is performed within each block to avoid interference from inter-block differences.
If provided, this should have length equal to the number of cells, where cells have the same value if and
only if they are in the same block. Defaults to None, indicating all cells are part of the same block.
span:
Span to use for the LOWESS trend fitting.
Larger values yield a smoother curve and reduces the risk of overfitting,
at the cost of being less responsive to local variations.
Defaults to 0.3.
assay_type:
Assay to use from ``input`` if it is a
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
feature_names:
Sequence of feature names of length equal to the number of rows in ``input``.
If provided, this is used as the row names of the output data frames.
num_threads:
Number of threads to use. Defaults to 1.
"""
block: Optional[Sequence] = None
span: float = 0.3
assay_type: Union[int, str] = "logcounts"
feature_names: Optional[Sequence[str]] = None
num_threads: int = 1
[docs]def model_gene_variances(
input: MatrixTypes, options: ModelGeneVariancesOptions = ModelGeneVariancesOptions()
) -> BiocFrame:
"""Compute gene variances and model them with a trend to account for
non-trivial mean-variance relationships in count data. The residual from
the trend can then be used to identify highly variable genes, e.g., with
:py:meth:`~scranpy.feature_selection.choose_hvgs.choose_hvgs`.
Args:
input:
Matrix-like object where rows are features and columns are cells,
typically containing log-normalized expression values from
:py:meth:`~scranpy.normalization.log_norm_counts.log_norm_counts`.
This should be a matrix class that can be converted into a
:py:class:`~mattress.TatamiNumericPointer`.
Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
containing such a matrix in its assays.
Developers may also provide a :py:class:`~mattress.TatamiNumericPointer` directly.
options:
Optional parameters.
Returns:
Data frame with variance modelling results for each gene, specifically
the mean log-expression, the variance, the fitted value of the
mean-variance trend and the residual from the trend. Each row
of the data frame corresponds to a row of ``input``.
For multiple blocks, the data frame's columns will represent the average
across blocks. An extra ``per_block`` column will also be present
containing a nested :py:class:`~biocframe.BiocFrame.BiocFrame`
with the same per-block statistics.
"""
x = tatamize_input(input, options.assay_type)
NR = x.nrow()
means = zeros((NR,), dtype=float64)
variances = zeros((NR,), dtype=float64)
fitted = zeros((NR,), dtype=float64)
residuals = zeros((NR,), dtype=float64)
extra = None
if options.block is None:
lib.model_gene_variances(
x.ptr,
means,
variances,
fitted,
residuals,
options.span,
options.num_threads,
)
return BiocFrame(
{
"means": means,
"variances": variances,
"fitted": fitted,
"residuals": residuals,
},
number_of_rows=NR,
row_names=options.feature_names,
)
else:
NC = x.ncol()
if len(options.block) != NC:
raise ValueError(
f"Must provide block assignments (provided: {len(options.block)})"
f" for all cells (expected: {NC})."
)
block_levels, block_indices = factorize(options.block)
nlevels = len(block_levels)
all_means = []
all_variances = []
all_fitted = []
all_residuals = []
all_means_ptr = zeros((nlevels,), dtype=uintp)
all_variances_ptr = zeros((nlevels,), dtype=uintp)
all_fitted_ptr = zeros((nlevels,), dtype=uintp)
all_residuals_ptr = zeros((nlevels,), dtype=uintp)
for lvl in range(nlevels):
cur_means = zeros((NR,), dtype=float64)
cur_variances = zeros((NR,), dtype=float64)
cur_fitted = zeros((NR,), dtype=float64)
cur_residuals = zeros((NR,), dtype=float64)
all_means_ptr[lvl] = cur_means.ctypes.data
all_variances_ptr[lvl] = cur_variances.ctypes.data
all_fitted_ptr[lvl] = cur_fitted.ctypes.data
all_residuals_ptr[lvl] = cur_residuals.ctypes.data
all_means.append(cur_means)
all_variances.append(cur_variances)
all_fitted.append(cur_fitted)
all_residuals.append(cur_residuals)
lib.model_gene_variances_blocked(
x.ptr,
means,
variances,
fitted,
residuals,
nlevels,
block_indices.ctypes.data,
all_means_ptr.ctypes.data,
all_variances_ptr.ctypes.data,
all_fitted_ptr.ctypes.data,
all_residuals_ptr.ctypes.data,
options.span,
options.num_threads,
)
extra = {}
for i in range(nlevels):
extra[block_levels[i]] = BiocFrame(
{
"means": all_means[i],
"variances": all_variances[i],
"fitted": all_fitted[i],
"residuals": all_residuals[i],
},
row_names=options.feature_names,
)
return BiocFrame(
{
"means": means,
"variances": variances,
"fitted": fitted,
"residuals": residuals,
"per_block": BiocFrame(extra),
},
number_of_rows=NR,
row_names=options.feature_names,
)