Skip to content

Commit 085d011

Browse files
authored
Merge pull request #38 from VariantEffect/accession-based
Multi-target and accession-based mapping
2 parents c9a4ca3 + 37c504e commit 085d011

21 files changed

+1989
-1124
lines changed

Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h
3737
curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \
3838
(cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install)
3939

40+
# Fetch and index GRCh37 and GRCh38 assemblies for cdot
41+
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.25_GRCh37.p13_genomic.fna.gz
42+
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz
43+
RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz
44+
RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz
45+
4046
RUN mkdir /usr/src/app
4147
WORKDIR /usr/src/app
4248
COPY . .

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"requests",
3636
"biopython",
3737
"tqdm",
38+
"cdot",
3839
"click",
3940
"cool-seq-tool==0.4.0.dev3",
4041
"ga4gh.vrs==2.0.0-a6",

settings/.env.dev

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ POSTGRES_DB=gene_normalizer
1717
# Environment variables for UTA connection via CoolSeqTool
1818
####################################################################################################
1919

20-
UTA_DB_URL=postgresql://anonymous:[email protected]:5432/uta/uta_20180821
20+
UTA_DB_URL=postgresql://anonymous:[email protected]:5432/uta/uta_20241220
2121

2222
####################################################################################################
2323
# Environment variables for MaveDB connection

src/api/routers/map.py

Lines changed: 130 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from fastapi.responses import JSONResponse
77
from requests import HTTPError
88

9-
from dcd_mapping.align import AlignmentError, BlatNotFoundError, align
9+
from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
1010
from dcd_mapping.annotate import (
1111
_get_computed_reference_sequence,
1212
_get_mapped_reference_sequence,
@@ -22,8 +22,14 @@
2222
with_mavedb_score_set,
2323
)
2424
from dcd_mapping.resource_utils import ResourceAcquisitionError
25-
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
26-
from dcd_mapping.transcripts import TxSelectError, select_transcript
25+
from dcd_mapping.schemas import (
26+
ScoreAnnotation,
27+
ScoresetMapping,
28+
TargetType,
29+
TxSelectResult,
30+
VrsVersion,
31+
)
32+
from dcd_mapping.transcripts import select_transcripts
2733
from dcd_mapping.vrs_map import VrsMapError, vrs_map
2834

2935
router = APIRouter(
@@ -33,21 +39,21 @@
3339

3440
@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
3541
@with_mavedb_score_set
36-
async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping:
42+
async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse:
3743
"""Perform end-to-end mapping for a scoreset.
3844
3945
:param urn: identifier for a scoreset.
40-
:param output_path: optional path to save output at
41-
:param vrs_version: version of VRS objects to output (1.3 or 2)
42-
:param silent: if True, suppress console information output
46+
:param store_path: optional path to save output at
4347
"""
4448
try:
4549
metadata = get_scoreset_metadata(urn, store_path)
46-
records = get_scoreset_records(urn, True, store_path)
50+
records = get_scoreset_records(metadata, True, store_path)
4751
except ScoresetNotSupportedError as e:
48-
return ScoresetMapping(
49-
metadata=None,
50-
error_message=str(e).strip("'"),
52+
return JSONResponse(
53+
content=ScoresetMapping(
54+
metadata=None,
55+
error_message=str(e).strip("'"),
56+
).model_dump(exclude_none=True)
5157
)
5258
except ResourceAcquisitionError as e:
5359
msg = f"Unable to acquire resource from MaveDB: {e}"
@@ -62,7 +68,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
6268
)
6369

6470
try:
65-
alignment_result = align(metadata, True)
71+
alignment_results = build_alignment_result(metadata, True)
6672
except BlatNotFoundError as e:
6773
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
6874
raise HTTPException(status_code=500, detail=msg) from e
@@ -75,77 +81,151 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
7581
metadata=metadata, error_message=str(e).strip("'")
7682
).model_dump(exclude_none=True)
7783
)
78-
79-
try:
80-
transcript = await select_transcript(metadata, records, alignment_result)
81-
except (TxSelectError, KeyError, ValueError) as e:
84+
except ScoresetNotSupportedError as e:
8285
return JSONResponse(
8386
content=ScoresetMapping(
8487
metadata=metadata, error_message=str(e).strip("'")
8588
).model_dump(exclude_none=True)
8689
)
90+
91+
try:
92+
transcripts = await select_transcripts(metadata, records, alignment_results)
93+
# NOTE: transcript selection errors are handled in select_transcripts,
94+
# and they do not cause the entire mapping process to exit; instead, an error will be reported
95+
# on the target level and on the variant level for variants relative to that target
96+
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
97+
# underlying issues with data providers.
8798
except HTTPError as e:
8899
msg = f"HTTP error occurred during transcript selection: {e}"
89100
raise HTTPException(status_code=500, detail=msg) from e
90101
except DataLookupError as e:
91102
msg = f"Data lookup error occurred during transcript selection: {e}"
92103
raise HTTPException(status_code=500, detail=msg) from e
93104

105+
vrs_results = {}
94106
try:
95-
vrs_results = vrs_map(metadata, alignment_result, records, transcript, True)
107+
for target_gene in metadata.target_genes:
108+
vrs_results[target_gene] = vrs_map(
109+
metadata=metadata.target_genes[target_gene],
110+
align_result=alignment_results[target_gene],
111+
records=records[target_gene],
112+
transcript=transcripts[target_gene],
113+
silent=True,
114+
)
96115
except VrsMapError as e:
97116
return JSONResponse(
98117
content=ScoresetMapping(
99118
metadata=metadata, error_message=str(e).strip("'")
100119
).model_dump(exclude_none=True)
101120
)
102-
if vrs_results is None:
103-
return ScoresetMapping(
104-
metadata=metadata,
105-
error_message="No variant mappings available for this score set",
121+
if not vrs_results or all(
122+
mapping_result is None for mapping_result in vrs_results.values()
123+
):
124+
return JSONResponse(
125+
content=ScoresetMapping(
126+
metadata=metadata,
127+
error_message="No variant mappings available for this score set",
128+
).model_dump(exclude_none=True)
106129
)
107130

131+
annotated_vrs_results = {}
108132
try:
109-
vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2)
133+
for target_gene in vrs_results:
134+
annotated_vrs_results[target_gene] = annotate(
135+
vrs_results[target_gene],
136+
transcripts[target_gene],
137+
metadata.target_genes[target_gene],
138+
metadata.urn,
139+
VrsVersion.V_2,
140+
)
110141
except Exception as e:
111142
return JSONResponse(
112143
content=ScoresetMapping(
113144
metadata=metadata, error_message=str(e).strip("'")
114145
).model_dump(exclude_none=True)
115146
)
116-
if vrs_results is None:
117-
return ScoresetMapping(
118-
metadata=metadata,
119-
error_message="No annotated variant mappings available for this score set",
147+
if not annotated_vrs_results or all(
148+
mapping_result is None for mapping_result in annotated_vrs_results.values()
149+
):
150+
return JSONResponse(
151+
content=ScoresetMapping(
152+
metadata=metadata,
153+
error_message="No annotated variant mappings available for this score set",
154+
).model_dump(exclude_none=True)
120155
)
121156

122157
try:
123158
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
124-
preferred_layers = {
125-
_set_scoreset_layer(urn, vrs_results),
126-
}
127-
128-
reference_sequences = {
129-
layer: {
130-
"computed_reference_sequence": None,
131-
"mapped_reference_sequence": None,
159+
reference_sequences: dict[str, dict] = {}
160+
mapped_scores: list[ScoreAnnotation] = []
161+
for target_gene in annotated_vrs_results:
162+
preferred_layers = {
163+
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
132164
}
133-
for layer in AnnotationLayer
134-
}
135-
136-
for layer in preferred_layers:
137-
reference_sequences[layer][
138-
"computed_reference_sequence"
139-
] = _get_computed_reference_sequence(metadata, layer, transcript)
140-
reference_sequences[layer][
141-
"mapped_reference_sequence"
142-
] = _get_mapped_reference_sequence(layer, transcript, alignment_result)
165+
target_gene_name = metadata.target_genes[target_gene].target_gene_name
166+
reference_sequences[target_gene_name] = {
167+
layer: {
168+
"computed_reference_sequence": None,
169+
"mapped_reference_sequence": None,
170+
}
171+
for layer in preferred_layers
172+
}
173+
# sometimes Nonetype layers show up in preferred layers dict; remove these
174+
preferred_layers.discard(None)
175+
for layer in preferred_layers:
176+
reference_sequences[target_gene_name][layer][
177+
"computed_reference_sequence"
178+
] = _get_computed_reference_sequence(
179+
metadata.target_genes[target_gene], layer, transcripts[target_gene]
180+
)
181+
reference_sequences[target_gene_name][layer][
182+
"mapped_reference_sequence"
183+
] = _get_mapped_reference_sequence(
184+
metadata.target_genes[target_gene],
185+
layer,
186+
transcripts[target_gene],
187+
alignment_results[target_gene],
188+
)
189+
190+
for m in annotated_vrs_results[target_gene]:
191+
if m.pre_mapped is None:
192+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
193+
elif m.annotation_layer in preferred_layers:
194+
# drop annotation layer from mapping object
195+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
196+
197+
# drop Nonetype reference sequences
198+
for target_gene in reference_sequences:
199+
for layer in list(reference_sequences[target_gene].keys()):
200+
if (
201+
reference_sequences[target_gene][layer][
202+
"mapped_reference_sequence"
203+
]
204+
is None
205+
and reference_sequences[target_gene][layer][
206+
"computed_reference_sequence"
207+
]
208+
is None
209+
) or layer is None:
210+
del reference_sequences[target_gene][layer]
211+
212+
# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
213+
if (
214+
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
215+
and metadata.target_genes[target_gene].target_gene_category
216+
== TargetType.PROTEIN_CODING
217+
and metadata.target_genes[target_gene].target_accession_id is None
218+
and transcripts[target_gene] is not None
219+
and isinstance(transcripts[target_gene], TxSelectResult)
220+
and transcripts[target_gene].nm is not None
221+
):
222+
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
223+
"computed_reference_sequence": None,
224+
"mapped_reference_sequence": {
225+
"sequence_accessions": [transcripts[target_gene].nm]
226+
},
227+
}
143228

144-
mapped_scores: list[ScoreAnnotation] = []
145-
for m in vrs_results:
146-
if m.annotation_layer in preferred_layers:
147-
# drop annotation layer from mapping object
148-
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
149229
except Exception as e:
150230
return JSONResponse(
151231
content=ScoresetMapping(
@@ -156,18 +236,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
156236
return JSONResponse(
157237
content=ScoresetMapping(
158238
metadata=raw_metadata,
159-
computed_protein_reference_sequence=reference_sequences[
160-
AnnotationLayer.PROTEIN
161-
]["computed_reference_sequence"],
162-
mapped_protein_reference_sequence=reference_sequences[
163-
AnnotationLayer.PROTEIN
164-
]["mapped_reference_sequence"],
165-
computed_genomic_reference_sequence=reference_sequences[
166-
AnnotationLayer.GENOMIC
167-
]["computed_reference_sequence"],
168-
mapped_genomic_reference_sequence=reference_sequences[
169-
AnnotationLayer.GENOMIC
170-
]["mapped_reference_sequence"],
239+
reference_sequences=reference_sequences,
171240
mapped_scores=mapped_scores,
172241
).model_dump(exclude_none=True)
173242
)

0 commit comments

Comments
 (0)