# Source code for ensembldb.registry

import os
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from pybiocfilecache import BiocFileCache

from ._ahub import AHUB_METADATA_URL
from .ensdb import EnsDb
from .record import EnsDbRecord

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


class EnsDbRegistry:
    """Registry for EnsDb resources.

    On construction the registry obtains the AnnotationHub metadata SQLite
    database through a ``BiocFileCache``, selects the EnsDb ``.sqlite``
    resources it lists, and indexes them by EnsDb ID. Individual databases
    can then be downloaded (and cached) on request.
    """

    # Cache entry name used for the AnnotationHub metadata database.
    _METADATA_RNAME = "annotationhub_metadata"

    # Filter for EnsDb sqlite files: checks rdataclass (or the title pattern)
    # AND the rdatapath extension.
    _ENSDB_QUERY = """
        SELECT
            r.id,
            r.title,
            r.species,
            r.taxonomyid,
            r.genome,
            r.description,
            lp.location_prefix || rp.rdatapath AS full_url,
            r.rdatadateadded
        FROM resources r
        LEFT JOIN location_prefixes lp ON r.location_prefix_id = lp.id
        LEFT JOIN rdatapaths rp ON rp.resource_id = r.id
        WHERE (rp.rdataclass = 'EnsDb' OR r.title LIKE 'Ensembl % EnsDb%')
          AND rp.rdatapath LIKE '%.sqlite'
        ORDER BY r.rdatadateadded DESC;
    """

    def __init__(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        force: bool = False,
    ) -> None:
        """Initialize the EnsDb registry.

        Args:
            cache_dir:
                Path to cache directory. Defaults to
                ``~/.cache/ensembldb_bfc``; created if it does not exist.

            force:
                Force re-download of metadata.

        Raises:
            RuntimeError: If the metadata database cannot be retrieved.
        """
        if cache_dir is None:
            cache_dir = Path.home() / ".cache" / "ensembldb_bfc"

        self._cache_dir = Path(cache_dir)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        self._bfc = BiocFileCache(self._cache_dir)
        self._registry_map: Dict[str, EnsDbRecord] = {}

        self._initialize_registry(force=force)

    def _initialize_registry(self, force: bool = False) -> None:
        """Populate registry from AnnotationHub metadata.

        Args:
            force:
                Discard any cached copy of the metadata and fetch it again.

        Raises:
            RuntimeError: If no metadata file is available even after a
                forced re-download.
        """
        rname = self._METADATA_RNAME

        existing = self._lookup(rname)
        if force and existing:
            self._evict(rname)
            existing = None

        if existing:
            md_resource = existing
        else:
            md_resource = self._bfc.add(rname, AHUB_METADATA_URL, rtype="web")

        md_path = self._get_filepath(md_resource)
        if not md_path or not os.path.exists(md_path):
            if existing and not force:
                # The cache knows the entry but its file is gone: retry once
                # with a forced refresh. The retry cannot recurse again
                # because ``force`` is then True.
                return self._initialize_registry(force=True)
            raise RuntimeError("Failed to retrieve AnnotationHub metadata.")

        conn = sqlite3.connect(md_path)
        try:
            rows = conn.execute(self._ENSDB_QUERY).fetchall()
        finally:
            conn.close()

        # Rows arrive newest-first; if two rows map to the same ``ensdb_id``
        # the later (older) row replaces the earlier one.
        # NOTE(review): confirm IDs are unique or that this precedence is intended.
        for row in rows:
            record = EnsDbRecord.from_db_row(row)
            self._registry_map[record.ensdb_id] = record

    def list_ensdbs(self) -> List[str]:
        """List available EnsDb IDs.

        Returns:
            The registered EnsDb IDs in sorted order.
        """
        return sorted(self._registry_map)

    def get_record(self, ensdb_id: str) -> "EnsDbRecord":
        """Look up the metadata record for an EnsDb ID.

        Args:
            ensdb_id:
                Identifier as returned by :py:meth:`list_ensdbs`.

        Returns:
            The matching record.

        Raises:
            KeyError: If ``ensdb_id`` is not in the registry.
        """
        try:
            return self._registry_map[ensdb_id]
        except KeyError:
            raise KeyError(f"ID '{ensdb_id}' not found.") from None

    def download(self, ensdb_id: str, force: bool = False) -> str:
        """Download an EnsDb SQLite file, reusing the cached copy if usable.

        Args:
            ensdb_id:
                Identifier of the EnsDb resource.

            force:
                Discard any cached copy and download again.

        Returns:
            Path to the local, non-empty SQLite file.

        Raises:
            KeyError: If ``ensdb_id`` is not in the registry.
            RuntimeError: If the download does not produce a non-empty file.
        """
        record = self.get_record(ensdb_id)
        key = ensdb_id

        if force:
            self._evict(key)
        else:
            cached = self._lookup(key)
            cached_path = self._usable_path(cached)
            if cached_path is not None:
                return cached_path
            if cached:
                # The entry exists but its file is missing or empty. Drop it
                # so the ``add`` below starts from a clean slate instead of
                # presumably colliding with the stale entry of the same name.
                self._evict(key)

        resource = self._bfc.add(
            key,
            record.url,
            rtype="web",
            download=True,
        )

        path = self._usable_path(resource)
        if path is None:
            # Do not leave a broken entry behind for the next call to find.
            self._evict(key)
            raise RuntimeError(f"Download failed for {ensdb_id}.")

        return path

    def load_db(self, ensdb_id: str, force: bool = False) -> "EnsDb":
        """Download (if needed) and open an EnsDb database.

        Args:
            ensdb_id:
                Identifier of the EnsDb resource.

            force:
                Discard any cached copy and download again.

        Returns:
            An ``EnsDb`` constructed from the local file path.
        """
        path = self.download(ensdb_id, force=force)
        return EnsDb(path)

    def _lookup(self, rname: str) -> Any:
        """Return the cache entry for ``rname``, or None if the lookup fails."""
        try:
            return self._bfc.get(rname)
        except Exception:
            # Best effort: an unreadable entry is treated like a missing one.
            return None

    def _evict(self, rname: str) -> None:
        """Remove ``rname`` from the cache, ignoring any failure."""
        try:
            self._bfc.remove(rname)
        except Exception:
            # Best effort: eviction is only cleanup, so a failure here must
            # not mask the caller's own outcome.
            pass

    def _usable_path(self, resource: Any) -> Optional[str]:
        """Return the resource's local path if it is a non-empty file."""
        try:
            path = self._get_filepath(resource)
            if path and os.path.isfile(path) and os.path.getsize(path) > 0:
                return path
        except Exception:
            # Best effort: any failure while probing counts as "not usable".
            pass
        return None

    def _get_filepath(self, resource: Any) -> Optional[str]:
        """Resolve a cache entry to a path inside the cache directory.

        Args:
            resource:
                Object exposing ``rpath`` as an attribute, or a mapping-like
                object with a ``get`` method holding an ``"rpath"`` key.

        Returns:
            The cache directory joined with the entry's ``rpath``, or None
            when the entry has no usable ``rpath``.
        """
        if hasattr(resource, "rpath"):
            rel_path = resource.rpath
        elif hasattr(resource, "get"):
            rel_path = resource.get("rpath")
        else:
            return None

        # A missing rpath must not be stringified into a bogus ".../None".
        if not rel_path:
            return None

        return str(self._cache_dir / str(rel_path))