Source code for ensembldb.registry
import os
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from pybiocfilecache import BiocFileCache
from ._ahub import AHUB_METADATA_URL
from .ensdb import EnsDb
from .record import EnsDbRecord
__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"
[docs]
class EnsDbRegistry:
    """Registry for EnsDb resources.

    On construction the registry obtains the AnnotationHub metadata SQLite
    file through a ``BiocFileCache`` rooted at ``cache_dir``, queries it for
    EnsDb ``.sqlite`` resources and indexes the resulting
    :class:`EnsDbRecord` objects by their ``ensdb_id``.
    """

    # Selects EnsDb sqlite resources from the AnnotationHub metadata database.
    # A row qualifies when its rdataclass is 'EnsDb' or its title looks like
    # an Ensembl EnsDb title, AND its rdatapath ends in '.sqlite'.
    _ENSDB_QUERY = """
        SELECT
            r.id,
            r.title,
            r.species,
            r.taxonomyid,
            r.genome,
            r.description,
            lp.location_prefix || rp.rdatapath AS full_url,
            r.rdatadateadded
        FROM resources r
        LEFT JOIN location_prefixes lp
            ON r.location_prefix_id = lp.id
        LEFT JOIN rdatapaths rp
            ON rp.resource_id = r.id
        WHERE (rp.rdataclass = 'EnsDb' OR r.title LIKE 'Ensembl % EnsDb%')
            AND rp.rdatapath LIKE '%.sqlite'
        ORDER BY r.rdatadateadded DESC;
    """

    def __init__(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        force: bool = False,
    ) -> None:
        """Initialize the EnsDb registry.

        Args:
            cache_dir: Path to cache directory. Defaults to
                ``~/.cache/ensembldb_bfc``. The directory is created if it
                does not exist.
            force: Force re-download of metadata.

        Raises:
            RuntimeError: If the AnnotationHub metadata file cannot be
                retrieved.
        """
        if cache_dir is None:
            cache_dir = Path.home() / ".cache" / "ensembldb_bfc"

        self._cache_dir = Path(cache_dir)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        self._bfc = BiocFileCache(self._cache_dir)
        self._registry_map: Dict[str, EnsDbRecord] = {}

        self._initialize_registry(force=force)

    def _initialize_registry(self, force: bool = False) -> None:
        """Populate registry from AnnotationHub metadata.

        Args:
            force: Discard any cached metadata entry and fetch it again.

        Raises:
            RuntimeError: If no metadata file is available on disk even
                after a fresh fetch.
        """
        rname = "annotationhub_metadata"

        existing = self._cached_resource(rname)
        if force and existing:
            self._discard(rname)
            existing = None

        if existing:
            md_resource = existing
        else:
            md_resource = self._bfc.add(rname, AHUB_METADATA_URL, rtype="web")

        md_path = self._get_filepath(md_resource)
        if not md_path or not os.path.exists(md_path):
            if existing and not force:
                # The cache knows the entry but its file is gone: retry once
                # with a forced refresh instead of failing immediately.
                self._initialize_registry(force=True)
                return
            raise RuntimeError("Failed to retrieve AnnotationHub metadata.")

        conn = sqlite3.connect(md_path)
        try:
            rows = conn.execute(self._ENSDB_QUERY).fetchall()
        finally:
            conn.close()

        # NOTE(review): rows arrive newest-first, so if two rows ever map to
        # the same ensdb_id the later (older) row overwrites the earlier one.
        # Confirm this is the intended precedence.
        for row in rows:
            record = EnsDbRecord.from_db_row(row)
            self._registry_map[record.ensdb_id] = record

    def list_ensdbs(self) -> List[str]:
        """List available EnsDb IDs.

        Returns:
            The registered IDs in sorted order.
        """
        return sorted(self._registry_map)

    def get_record(self, ensdb_id: str) -> EnsDbRecord:
        """Return the registry record for ``ensdb_id``.

        Args:
            ensdb_id: Identifier as returned by :meth:`list_ensdbs`.

        Returns:
            The matching record.

        Raises:
            KeyError: If ``ensdb_id`` is not in the registry.
        """
        try:
            return self._registry_map[ensdb_id]
        except KeyError:
            raise KeyError(f"ID '{ensdb_id}' not found.") from None

    def download(self, ensdb_id: str, force: bool = False) -> str:
        """Download the EnsDb file for ``ensdb_id`` and return its path.

        A previously cached, non-empty file is reused unless ``force`` is
        set.

        Args:
            ensdb_id: Identifier as returned by :meth:`list_ensdbs`.
            force: Discard any cached copy and download again.

        Returns:
            Path to the downloaded file inside the cache directory.

        Raises:
            KeyError: If ``ensdb_id`` is not in the registry.
            RuntimeError: If the download yields no file or an empty file.
        """
        record = self.get_record(ensdb_id)
        url = record.url
        key = ensdb_id

        if force:
            self._discard(key)
        else:
            cached = self._cached_resource(key)
            if cached:
                cached_path = self._get_filepath(cached)
                if self._is_usable_file(cached_path):
                    return cached_path
                # Entry exists but its file is missing or empty: drop the
                # stale entry before re-adding, mirroring what
                # _initialize_registry does for the metadata file.
                self._discard(key)

        resource = self._bfc.add(
            key,
            url,
            rtype="web",
            download=True,
        )

        path = self._get_filepath(resource)
        if not self._is_usable_file(path):
            # Do not leave a broken entry behind for the next call to reuse.
            self._discard(key)
            raise RuntimeError(f"Download failed for {ensdb_id}.")

        return path

    def load_db(self, ensdb_id: str, force: bool = False) -> EnsDb:
        """Download (if needed) and open the EnsDb for ``ensdb_id``.

        Args:
            ensdb_id: Identifier as returned by :meth:`list_ensdbs`.
            force: Passed through to :meth:`download`.

        Returns:
            An ``EnsDb`` constructed from the downloaded file path.
        """
        path = self.download(ensdb_id, force=force)
        return EnsDb(path)

    def _cached_resource(self, rname: str) -> Any:
        """Return the cache entry for ``rname``, or ``None`` if lookup fails."""
        try:
            return self._bfc.get(rname)
        except Exception:
            # Deliberate best effort: a failed lookup is treated as a miss.
            return None

    def _discard(self, rname: str) -> None:
        """Remove ``rname`` from the cache, ignoring any failure."""
        try:
            self._bfc.remove(rname)
        except Exception:
            # Deliberate best effort: the entry may simply not exist.
            pass

    @staticmethod
    def _is_usable_file(path: Optional[str]) -> bool:
        """Return True if ``path`` exists on disk with a non-zero size."""
        if not path:
            return False
        try:
            return os.path.getsize(path) > 0
        except OSError:
            # Missing or unreadable path.
            return False

    def _get_filepath(self, resource: Any) -> Optional[str]:
        """Resolve a cache resource to a path under the cache directory.

        Args:
            resource: Either an object with an ``rpath`` attribute or a
                mapping-like object exposing ``get("rpath")``.

        Returns:
            ``str(cache_dir / rpath)``, or ``None`` when the resource has
            neither form or carries no ``rpath`` value.
        """
        if hasattr(resource, "rpath"):
            rel_path = resource.rpath
        elif hasattr(resource, "get"):
            rel_path = resource.get("rpath")
        else:
            return None

        # Without this guard a missing rpath would be stringified to "None"
        # and produce a bogus "<cache_dir>/None" path.
        if not rel_path:
            return None

        return str(self._cache_dir / str(rel_path))