Source code for genomicranges.SeqInfo

from typing import Dict, List, Optional, Sequence, Union
from warnings import warn

import biocutils as ut

from .utils import _sanitize_vec

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def _validate_seqnames(seqnames):
    if not ut.is_list_of_type(seqnames, str):
        raise ValueError("'seqnames' should be a list of strings.")

    n = len(seqnames)
    if n != len(set(seqnames)):
        raise ValueError("'seqnames' should contain unique strings.")


def _validate_seqlengths(seqlengths, num_seqs):
    if not ut.is_list_of_type(seqlengths, int, ignore_none=True):
        raise ValueError("'seqlengths' should be a list of integers.")

    if num_seqs != len(seqlengths):
        raise ValueError("'seqnames' and 'seqlengths' should have the same length.")

    for sl in seqlengths:
        if sl is not None and sl < 0:
            raise ValueError("all entries of 'seqlengths' should be non-negative.")


def _validate_is_circular(is_circular, num_seqs):
    if not ut.is_list_of_type(is_circular, bool, ignore_none=True):
        raise ValueError("'is_circular' should be a list of booleans.")

    if num_seqs != len(is_circular):
        raise ValueError("'seqnames' and 'is_circular' should have the same length.")


def _validate_genome(genome, num_seqs):
    if not ut.is_list_of_type(genome, str, ignore_none=True):
        raise ValueError("'genome' should be a list of strings.")

    if num_seqs != len(genome):
        raise ValueError("'seqnames' and 'genome' should have the same length.")


[docs] class SeqInfoIterator: """An iterator to a :py:class:`~SeqInfo` object."""
[docs] def __init__(self, obj: "SeqInfo") -> None: """Initialize the iterator. Args: obj: Source object to iterate. """ self._sinfo = obj self._current_index = 0
[docs] def __iter__(self): return self
[docs] def __next__(self): if self._current_index < len(self._sinfo): iter_row_index = self._sinfo._seqnames[self._current_index] iter_slice = self._sinfo[self._current_index] self._current_index += 1 return (iter_row_index, iter_slice) raise StopIteration
[docs] class SeqInfo: """Information about the reference sequences, specifically the name and length of each sequence, whether it is a circular, and the identity of the genome from which it was derived."""
[docs] def __init__( self, seqnames: Sequence[str], seqlengths: Optional[Union[int, Sequence[int], Dict[str, int]]] = None, is_circular: Optional[Union[bool, Sequence[bool], Dict[str, bool]]] = None, genome: Optional[Union[str, Sequence[str], Dict[str, str]]] = None, validate: bool = True, ) -> None: """ Args: seqnames: Names of all reference sequences, should be unique. seqlengths: Lengths of all sequences in base pairs. This should contain non-negative values and have the same number of elements as ``seqnames``. Entries may also be None if no lengths are available for that sequence. Alternatively, a dictionary where keys are the sequence names and values are the lengths. If a name is missing from this dictionary, the length of the sequence is set to None. Alternatively a single integer, if all sequences are of the same length. Alternatively None, if no length information is available for any sequence. is_circular: Whether each sequence is circular. This should have the same number of elements as ``seqnames``. Entries may also be None if no information is available for that sequence. Alternatively, a dictionary where keys are the sequence names and values are the circular flags. If a name is missing from this dictionary, the flag for the sequence is set to None. Alternatively a single boolean, if all sequences have the same circular flag. Alternatively None, if no flags are available for any sequence. genome: The genome build containing each reference sequence. This should have the same number of elements as ``seqnames``. Entries may also be None if no information is available. Alternatively, a dictionary where keys are the sequence names and values are the genomes. If a name is missing from this dictionary, the genome is set to None. Alternatively a single string, if all sequences are derived from the same genome. Alternatively None, if no genome information is available for any sequence. validate: Whether to validate the arguments, internal use only. """ self._seqnames = list(seqnames) self._reverse_seqnames = None self._seqlengths = self._flatten_incoming(seqlengths, int) self._is_circular = self._flatten_incoming(is_circular, bool) self._genome = self._flatten_incoming(genome, str) if validate: _validate_seqnames(self._seqnames) num_seqs = len(self._seqnames) _validate_seqlengths(self._seqlengths, num_seqs) _validate_is_circular(self._is_circular, num_seqs) _validate_genome(self._genome, num_seqs)
def _populate_reverse_seqnames_index(self): if self._reverse_seqnames is None: revmap = {} for i, n in enumerate(self._seqnames): if n not in revmap: revmap[n] = i self._reverse_seqnames = revmap def _wipe_reverse_seqnames_index(self): self._reverse_seqnames = None def _flatten_incoming(self, values, expected) -> List: if values is None or isinstance(values, expected): return [values] * len(self) if isinstance(values, dict): output = [] for n in self._seqnames: if n in values: output.append(values[n]) else: output.append(None) return output values = _sanitize_vec(values) if isinstance(values, list): return values return list(values) def _define_output(self, in_place: bool = False) -> "SeqInfo": if in_place is True: return self else: return self.__copy__() ######################### ######>> Copying <<###### #########################
[docs] def __deepcopy__(self, memo=None, _nil=[]): """ Returns: A deep copy of the current ``SeqInfo``. """ from copy import deepcopy _seqnames_copy = deepcopy(self._seqnames) _seqlengths_copy = deepcopy(self._seqlengths) _is_circular_copy = deepcopy(self._is_circular) _genome_copy = deepcopy(self._genome) current_class_const = type(self) return current_class_const( seqnames=_seqnames_copy, seqlengths=_seqlengths_copy, is_circular=_is_circular_copy, genome=_genome_copy, validate=False, )
[docs] def __copy__(self): """ Returns: A shallow copy of the current ``SeqInfo``. """ current_class_const = type(self) return current_class_const( self._seqnames, self._seqlengths, self._is_circular, self._genome, validate=False, )
[docs] def copy(self): """Alias for :py:meth:`~__copy__`.""" return self.__copy__()
########################## ######>> Printing <<###### ##########################
[docs] def __repr__(self) -> str: """ Returns: A string representation of this ``SeqInfo``. """ output = "SeqInfo(number_of_seqnames=" + str(len(self)) output += ", seqnames=" + ut.print_truncated_list(self._seqnames) output += ", seqlengths=" + repr(self._seqlengths) output += ", is_circular=" + ut.print_truncated_list(self._is_circular) output += ", genome=" + ut.print_truncated_list(self._genome) output += ")" return output
def __str__(self) -> str: """ Returns: A pretty-printed string containing the contents of this ``SeqInfo``. """ output = f"SeqInfo with {len(self)} sequence{'s' if len(self) != 1 else ''}\n" nr = len(self) added_table = False if nr: if nr <= 10: indices = range(nr) insert_ellipsis = False else: indices = [0, 1, 2, nr - 3, nr - 2, nr - 1] insert_ellipsis = True raw_floating = ut.create_floating_names(None, indices) if insert_ellipsis: raw_floating = raw_floating[:3] + [""] + raw_floating[3:] floating = ["", ""] + raw_floating columns = [] header = ["seqnames", "<str>"] showed = [f"{self._seqnames[x]}" for x in indices] if insert_ellipsis: showed = showed[:3] + ["..."] + showed[3:] columns.append(header + showed) header = ["seqlengths", f"<{ut.print_type(self._seqlengths)}>"] showed = [f"{self._seqlengths[x]}" for x in indices] if insert_ellipsis: showed = showed[:3] + ["..."] + showed[3:] columns.append(header + showed) header = ["is_circular", f"<{ut.print_type(self._is_circular)}>"] showed = [f"{self._is_circular[x]}" for x in indices] if insert_ellipsis: showed = showed[:3] + ["..."] + showed[3:] columns.append(header + showed) header = ["genome", f"<{ut.print_type(self._genome)}>"] showed = [f"{self._genome[x]}" for x in indices] if insert_ellipsis: showed = showed[:3] + ["..."] + showed[3:] columns.append(header + showed) output += ut.print_wrapped_table(columns, floating_names=floating) added_table = True footer = [] if len(footer): if added_table: output += "\n------\n" output += "\n".join(footer) return output ########################## ######>> seqnames <<###### ##########################
[docs] def get_seqnames(self) -> List[str]: """ Returns: List of all chromosome names. """ return self._seqnames
[docs] def set_seqnames( self, seqnames: Sequence[str], in_place: bool = False ) -> "SeqInfo": """ Args: seqnames: List of sequence names, of length equal to the number of names in this ``SeqInfo`` object. All names should be unique strings. in_place: Whether to modify the ``SeqInfo`` object in place. Returns: A modified ``SeqInfo`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ _validate_seqnames(list(seqnames)) output = self._define_output(in_place) output._seqnames = list(seqnames) return output
@property def seqnames(self) -> List[str]: warn("'seqnames' is deprecated, use 'get_seqnames' instead", UserWarning) return self.get_seqnames() @seqnames.setter def seqnames(self, seqnames: Sequence[str]): warn( "Setting property 'seqnames' is an in-place operation, use 'set_seqnames' instead", UserWarning, ) self.set_seqnames(seqnames, in_place=True) ############################ ######>> seqlengths <<###### ############################
[docs] def get_seqlengths(self) -> List[int]: """ Returns: A list of integers is returned containing the lengths of all sequences, in the same order as the sequence names from :py:meth:`~get_seqnames`. """ return self._seqlengths
[docs] def set_seqlengths( self, seqlengths: Optional[Union[int, Sequence[int], Dict[str, int]]], in_place: bool = False, ) -> "SeqInfo": """ Args: seqlengths: List of sequence lengths, of length equal to the number of names in this ``SeqInfo`` object. Values may be None or non-negative integers. Alternatively, a dictionary where keys are the sequence names and values are the lengths. Not all names need to be present in which case the length is assumed to be None. in_place: Whether to modify the ``SeqInfo`` object in place. Returns: A modified ``SeqInfo`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ _seqlengths = self._flatten_incoming(seqlengths, int) _validate_seqlengths(_seqlengths, len(self)) output = self._define_output(in_place) output._seqlengths = _seqlengths return output
@property def seqlengths(self) -> List[int]: warn( "'seqlengths' is deprecated, use 'get_seqlengths' instead", UserWarning, ) return self.get_seqlengths() @seqlengths.setter def seqlengths( self, seqlengths: Optional[Union[int, Sequence[int], Dict[str, int]]] ): warn( "Setting property 'seqlengths' is an in-place operation, use 'set_seqlengths' instead", UserWarning, ) self.set_seqlengths(seqlengths, in_place=True) ############################# ######>> is-circular <<###### #############################
[docs] def get_is_circular(self) -> List[bool]: """ Returns: A list of booleans is returned specifying whether each sequence (from :py:meth:`~get_seqnames`) is circular. """ return self._is_circular
[docs] def set_is_circular( self, is_circular: Optional[Union[bool, Sequence[bool], Dict[str, bool]]], in_place: bool = False, ) -> "SeqInfo": """ Args: is_circular: List of circular flags, of length equal to the number of names in this ``SeqInfo`` object. Values may be None or booleans. Alternatively, a dictionary where keys are the sequence names and values are the flags. Not all names need to be present in which case the flag is assumed to be None. in_place: Whether to modify the ``SeqInfo`` object in place. Returns: A modified ``SeqInfo`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ _is_circular = self._flatten_incoming(is_circular, bool) _validate_is_circular(_is_circular, len(self)) output = self._define_output(in_place) output._is_circular = _is_circular return output
@property def is_circular(self) -> List[bool]: warn( "'is_circular' is deprecated, use 'get_is_circular' instead", UserWarning, ) return self.get_is_circular() @is_circular.setter def is_circular( self, is_circular: Optional[Union[bool, Sequence[bool], Dict[str, bool]]] ): warn( "Setting property 'is_circular' is an in-place operation, use 'set_is_circular' instead", UserWarning, ) self.set_is_circular(is_circular, in_place=True) ######################## ######>> genome <<###### ########################
[docs] def get_genome(self) -> List[str]: """ Returns: A list of strings is returned containing the genome identity for all sequences in :py:meth:`~get_seqnames`. """ return self._genome
[docs] def set_genome( self, genome: Optional[Union[str, Sequence[str], Dict[str, str]]], in_place: bool = False, ) -> "SeqInfo": """ Args: genome: List of genomes, of length equal to the number of names in this ``SeqInfo`` object. Values may be None or strings. in_place: Whether to modify the ``SeqInfo`` object in place. Returns: A modified ``SeqInfo`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ _genome = self._flatten_incoming(genome, str) _validate_genome(_genome, len(self)) output = self._define_output(in_place) output._genome = _genome return output
@property def genome(self) -> List[str]: warn("'genome' is deprecated, use 'get_genome' instead", UserWarning) return self.get_genome() @genome.setter def genome(self, genome: Optional[Union[bool, Sequence[bool], Dict[str, bool]]]): warn( "Setting property 'genome' is an in-place operation, use 'set_genome' instead", UserWarning, ) self.set_genome(genome, in_place=True) ###################################### ######>> length and iterators <<###### ######################################
[docs] def __len__(self) -> int: """ Returns: Number of sequences in this object. """ return len(self._seqnames)
[docs] def __iter__(self) -> SeqInfoIterator: """Iterator over sequences.""" return SeqInfoIterator(self)
######################### ######>> Slicers <<###### #########################
[docs] def get_subset(self, subset: Union[str, int, bool, Sequence]) -> "SeqInfo": """Subset ``SeqInfo``, based on their indices or seqnames. Args: subset: Indices to be extracted. This may be an integer, boolean, string, or any sequence thereof, as supported by :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. Scalars are treated as length-1 sequences. Strings may only be used if :py:attr:``~seqnames`` are available (see :py:meth:`~get_seqnames`). The first occurrence of each string in the seqnames is used for extraction. Returns: A new ``SeqInfo`` object with the sequences of interest. """ if len(self) == 0: return SeqInfo.empty() idx, _ = ut.normalize_subscript(subset, len(self), self._seqnames) current_class_const = type(self) return current_class_const( seqnames=ut.subset_sequence(self._seqnames, idx), seqlengths=ut.subset_sequence(self._seqlengths, idx), is_circular=ut.subset_sequence(self._is_circular, idx), genome=ut.subset_sequence(self._genome, idx), )
[docs] def __getitem__(self, subset: Union[str, int, bool, Sequence]) -> "SeqInfo": """Alias to :py:attr:`~get_subset`.""" return self.get_subset(subset)
[docs] @classmethod def empty(cls): """Create an zero-length `SeqInfo` object. Returns: same type as caller, in this case a `SeqInfo`. """ return SeqInfo([], [], [], [])
@ut.combine_sequences.register def _combine_SeqInfo(*x: SeqInfo) -> SeqInfo: return merge_SeqInfo(x)
[docs] def merge_SeqInfo(objects: List[SeqInfo]) -> SeqInfo: """Merge multiple :py:class:`~SeqInfo` objects, taking the union of all reference sequences. If the same reference sequence is present with the same details across ``objects``, only a single instance is present in the final object; if details are contradictory, they are replaced with None. Args: objects: List of ``SeqInfo`` objects. Returns: A single merged ``SeqInfo`` object. """ all_sequences = {} for obj in objects: for i, y in enumerate(obj._seqnames): curlen = obj._seqlengths[i] curcir = obj._is_circular[i] curgen = obj._genome[i] if y not in all_sequences: all_sequences[y] = [curlen, curcir, curgen] else: present = all_sequences[y] prelen, precir, pregen = present if prelen != curlen: present[0] = None if precir != curcir: present[1] = None if pregen != curgen: present[2] = None out_names = [] out_lengths = [] out_circular = [] out_genome = [] for k, v in all_sequences.items(): out_names.append(k) out_lengths.append(v[0]) out_circular.append(v[1]) out_genome.append(v[2]) return SeqInfo(out_names, out_lengths, out_circular, out_genome, validate=False)