Source code for genomicranges.io.gtf

import logging
from typing import Dict, List, Union

# Variation of https://github.com/epiviz/epivizfileserver/src/epivizfileserver/cli.py

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


def _parse_all_attribute(row: str) -> Dict:
    """Extract all keys from the gtf/gff attribute string.

    Args:
        row:
            A row from GTF.

    Returns:
        A dictionary containing extracted keys and their values.
    """
    attr = row["group"]
    infos = attr.split(";")
    vals = {}
    for info in infos:
        if info == "" or len(info) == 0:
            continue
        imap = info.strip().split(" ", 1)
        vals[imap[0]] = imap[1].strip().strip('"')
    return {**row, **vals}


[docs] def parse_gtf( path: str, compressed: bool, skiprows: Union[int, List[int]] = None, comment: str = "#", ): """Read a GTF file as :py:class:`~pandas.DataFrame`. Args: path: Path to the GTF file. compressed: Whether the file is gzip compressed. skiprows: Rows to skip if the gtf file has header. comment: Character indicating that the line should not be parsed. Defaults to "#". Returns: Pandas DataFrame containing annotations from GTF. """ from joblib import Parallel, delayed from pandas import DataFrame, read_csv logging.info(f"Reading File - {path}") if compressed: df = read_csv( path, sep="\t", names=[ "seqnames", "source", "feature", "starts", "ends", "score", "strand", "frame", "group", ], compression="gzip", skiprows=skiprows, comment=comment, ) else: df = read_csv( path, sep="\t", names=[ "seqnames", "source", "feature", "starts", "ends", "score", "strand", "frame", "group", ], skiprows=skiprows, comment=comment, ) rows = Parallel(n_jobs=-2)( delayed(_parse_all_attribute)(row) for _, row in df.iterrows() ) gtf = DataFrame.from_records(rows) gtf.drop(["group"], axis=1) return gtf
[docs] def read_gtf( file: str, skiprows: Union[int, List[int]] = None, comment: str = "#", ) -> "GenomicRanges": """Read a GTF file as :py:class:`~genomicranges.GenomicRanges.GenomicRanges`. Args: file: Path to GTF file. skiprows: Rows to skip if the gtf file has header. comment: Character indicating that the line should not be parsed. Defaults to "#". Returns: Genomic Ranges with annotations from the GTF file. """ compressed = True if file.endswith("gz") else False data = parse_gtf(file, compressed=compressed, skiprows=skiprows, comment=comment) from ..GenomicRanges import GenomicRanges return GenomicRanges.from_pandas(data)