Skip to content

Commit 6598890

Browse files
committed
feat: Implement get_overlapping_features_for_region function
This function queries the Ensembl API with exponential backoff as needed, returning a list of features which overlap the passed region.
1 parent fd2b69d commit 6598890

File tree

5 files changed

+444
-0
lines changed

5 files changed

+444
-0
lines changed

settings/.env.dev

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,9 @@ MAVEDB_API_KEY=
3131
####################################################################################################
3232

3333
SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20
34+
35+
####################################################################################################
36+
# Environment variables for Ensembl
37+
####################################################################################################
38+
39+
ENSEMBL_API_URL=https://rest.ensembl.org

src/dcd_mapping/lookup.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import logging
1212
import os
1313
from pathlib import Path
14+
from typing import Any
1415

1516
import hgvs
1617
import polars as pl
@@ -50,6 +51,7 @@
5051
from gene.schemas import MatchType, SourceName
5152

5253
from dcd_mapping.exceptions import DataLookupError
54+
from dcd_mapping.resource_utils import ENSEMBL_API_URL, request_with_backoff
5355
from dcd_mapping.schemas import (
5456
GeneLocation,
5557
ManeDescription,
@@ -645,6 +647,68 @@ def _sort_mane_result(description: ManeDescription) -> int:
645647
return mane_data
646648

647649

650+
# --------------------------------- Ensembl --------------------------------- #
651+
652+
653+
def get_overlapping_features_for_region(
    chromosome: str, start: int, end: int, features: list[str] | None = None
) -> list[dict[str, Any]]:
    """Get Ensembl features overlapping a specific genomic region.

    Queries the Ensembl REST ``overlap/region`` endpoint for the human genome,
    using ``request_with_backoff`` to perform the request. Request failures and
    undecodable response bodies are logged and reported as an empty list
    rather than raised.

    :param chromosome: Chromosome identifier; it is converted with
        ``get_chromosome_identifier`` before being placed in the query
    :param start: Start position of the region
    :param end: End position of the region
    :param features: List of feature types to retrieve (default is ["gene"])
    :return: The decoded JSON response (one dict per overlapping feature), or
        an empty list if the request fails or the body is not valid JSON
    """
    if not features:
        features = ["gene"]
        _logger.debug("No features specified, defaulting to %s", features)

    chrom = get_chromosome_identifier(chromosome)

    # The endpoint takes one ``feature=`` parameter per requested type,
    # separated by ";". Joining avoids the dangling trailing separator.
    feature_params = ";".join(f"feature={feature}" for feature in features)
    url = (
        f"{ENSEMBL_API_URL}/overlap/region/human"
        f"/{chrom}:{start}-{end}?{feature_params}"
    )

    _logger.debug(
        "Fetching overlapping features for region %s:%d-%d with features %s",
        chromosome,
        start,
        end,
        features,
    )

    try:
        response = request_with_backoff(
            url, headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        # Decode inside the guarded section so a malformed body is handled the
        # same way as any other failed lookup. ValueError covers JSON decode
        # errors regardless of which decoder exception type is raised.
        overlapping_features = response.json()
    except (requests.RequestException, ValueError) as e:
        _logger.error(
            "Failed to fetch overlapping features for region %s-%s on chromosome %s: %s",
            start,
            end,
            chromosome,
            e,
        )
        return []

    _logger.debug(
        "Successfully fetched %d overlapping features for region %s:%d-%d with features %s",
        len(overlapping_features),
        chromosome,
        start,
        end,
        features,
    )
    return overlapping_features
710+
711+
648712
# ---------------------------------- Misc. ---------------------------------- #
649713

650714

src/dcd_mapping/resource_utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Provide basic utilities for fetching and storing external data."""
22
import os
3+
import time
34
from pathlib import Path
45

56
import click
@@ -8,6 +9,7 @@
89

910
# MaveDB API key and base URL, read from the environment; each is None when
# the corresponding variable is unset.
MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
# Base URL for Ensembl REST requests; falls back to https://rest.ensembl.org
# when the ENSEMBL_API_URL environment variable is unset.
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
1113

1214
LOCAL_STORE_PATH = Path(
1315
os.environ.get(
@@ -57,3 +59,67 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
5759
if chunk:
5860
h.write(chunk)
5961
return out_path
62+
63+
64+
def request_with_backoff(
    url: str, max_retries: int = 5, backoff_factor: float = 0.3, **kwargs
) -> requests.Response:
    """HTTP GET with exponential backoff only for retryable errors.

    Retries on:
    - Connection timeout or connection errors
    - HTTP 5xx server errors
    - HTTP 429 rate limiting (respecting Retry-After when present)

    Immediately raises on other HTTP errors (e.g., 4xx client errors). Any
    other response (2xx, and non-error statuses such as 3xx) is returned to the
    caller as-is.

    :param url: URL to fetch
    :param max_retries: Maximum number of attempts to make
    :param backoff_factor: Base delay in seconds; the delay before retry
        number ``n`` (0-based) is ``backoff_factor * 2**n``
    :param kwargs: Extra keyword arguments forwarded to ``requests.get``. A
        ``timeout`` of 60 seconds is used unless one is supplied here.
    :return: The first response that does not call for a retry
    :raise requests.Timeout: if the final attempt times out
    :raise requests.ConnectionError: if the final attempt cannot connect
    :raise requests.HTTPError: on a non-retryable error status, or if the
        final attempt still returns 429 or 5xx
    :raise requests.RequestException: if no attempt was made at all
        (``max_retries`` is less than 1)
    """
    # Let callers override the timeout instead of colliding with a hard-coded
    # keyword argument.
    kwargs.setdefault("timeout", 60)

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        delay = backoff_factor * (2**attempt)

        try:
            response = requests.get(url, **kwargs)
        except (requests.Timeout, requests.ConnectionError):
            # Retry on transient network failures
            if is_last_attempt:
                raise
            time.sleep(delay)
            continue

        status = response.status_code
        retryable = status == 429 or 500 <= status < 600

        if not retryable:
            # Raises for 4xx; for everything else hand the response back.
            # Returning here (rather than looping again) matters for statuses
            # that are neither success nor error, which would otherwise be
            # re-requested forever without consuming an attempt.
            response.raise_for_status()
            return response

        if is_last_attempt:
            # Out of attempts: surface the 429/5xx as an HTTPError.
            response.raise_for_status()

        if status == 429:
            # 429: Too Many Requests — optionally use Retry-After
            retry_after = response.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    requested = float(retry_after)
                except ValueError:
                    # Not a number of seconds; keep the computed backoff.
                    requested = None
                # Only trust finite, non-negative values: time.sleep rejects
                # negatives, NaN and infinity.
                if requested is not None and 0 <= requested < float("inf"):
                    delay = requested

        time.sleep(delay)

    # Only reachable when the loop body never ran (max_retries < 1). Raised as
    # a requests exception so callers handling request failures also see it.
    msg = f"Failed to fetch {url} after {max_retries} attempts"
    raise requests.RequestException(msg)

tests/test_lookup.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Tests for dcd_mapping.lookup"""
2+
3+
from unittest.mock import patch
4+
5+
import requests
6+
7+
from dcd_mapping.lookup import get_overlapping_features_for_region
8+
9+
# Fixture: list of three gene feature records used as the fake JSON payload
# returned by the mocked request in the tests below.
RAW_OVERLAP_RESPONSE = [
    {
        "seq_region_name": "22",
        "version": 1,
        "biotype": "protein_coding",
        "feature_type": "gene",
        "description": "novel transcript",
        "logic_name": "havana_homo_sapiens",
        "start": 19717220,
        "id": "ENSG00000284874",
        "source": "havana",
        "canonical_transcript": "ENST00000455843.5",
        "assembly_name": "GRCh38",
        "end": 19724772,
        "gene_id": "ENSG00000284874",
        "strand": 1,
    },
    {
        "source": "ensembl_havana",
        "canonical_transcript": "ENST00000366425.4",
        "assembly_name": "GRCh38",
        "end": 19724776,
        "gene_id": "ENSG00000203618",
        "strand": 1,
        "external_name": "GP1BB",
        "seq_region_name": "22",
        "version": 7,
        "biotype": "protein_coding",
        "logic_name": "ensembl_havana_gene_homo_sapiens",
        "feature_type": "gene",
        "start": 19723539,
        "description": "glycoprotein Ib platelet subunit beta [Source:HGNC Symbol;Acc:HGNC:4440]",
        "id": "ENSG00000203618",
    },
    {
        "end": 19724224,
        "gene_id": "ENSG00000184702",
        "strand": 1,
        "canonical_transcript": "ENST00000455784.7",
        "source": "ensembl_havana",
        "assembly_name": "GRCh38",
        "seq_region_name": "22",
        "version": 21,
        "biotype": "protein_coding",
        "description": "septin 5 [Source:HGNC Symbol;Acc:HGNC:9164]",
        "feature_type": "gene",
        "logic_name": "ensembl_havana_gene_homo_sapiens",
        "start": 19714467,
        "id": "ENSG00000184702",
        "external_name": "SEPTIN5",
    },
]
61+
62+
63+
class _FakeResponse:
64+
def __init__(self, data):
65+
self._data = data
66+
self.status_code = 200
67+
68+
def json(self):
69+
return self._data
70+
71+
def raise_for_status(self):
72+
return None
73+
74+
75+
def test_get_overlapping_features_for_region_success():
    """A successful request yields the decoded feature list unchanged."""
    canned_response = _FakeResponse(RAW_OVERLAP_RESPONSE)

    # Both collaborators are replaced: the HTTP call returns the canned
    # response and the chromosome lookup echoes its argument back.
    with patch(
        "dcd_mapping.lookup.request_with_backoff",
        return_value=canned_response,
    ):
        with patch(
            "dcd_mapping.lookup.get_chromosome_identifier",
            side_effect=lambda c: c,
        ):
            result = get_overlapping_features_for_region(
                "NC_000022.11", 19714000, 19725000, features=["gene"]
            )

    assert isinstance(result, list)
    assert result == RAW_OVERLAP_RESPONSE
88+
89+
90+
def test_get_overlapping_features_for_region_error():
    """A response whose status check raises is reported as an empty list."""

    class _FailingResponse(_FakeResponse):
        """Response with status 500 whose status check always raises."""

        def __init__(self):
            super().__init__(None)
            self.status_code = 500

        def raise_for_status(self):
            msg = f"HTTP {self.status_code} Error"
            raise requests.RequestException(msg)

    failing_response = _FailingResponse()

    with patch(
        "dcd_mapping.lookup.request_with_backoff",
        return_value=failing_response,
    ):
        with patch(
            "dcd_mapping.lookup.get_chromosome_identifier",
            side_effect=lambda c: c,
        ):
            result = get_overlapping_features_for_region(
                "NC_000022.11", 19714000, 19725000, features=["gene"]
            )

    assert result == []

0 commit comments

Comments
 (0)