Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
c8a31e7
Multi-target mapping partial draft: alignment and transcript selection
sallybg Feb 6, 2025
0b68f5a
Make it the default behavior to prefer genomic mappings when available
sallybg Feb 13, 2025
4ed6cb9
Map score sets with multiple targets
sallybg Feb 13, 2025
d94d55c
fix: single-target blat result named incorrectly
sallybg Mar 18, 2025
bfcbcba
Remove blank reference sequence entries from output
sallybg Mar 18, 2025
20e3aa2
Fix type annotations for target gene metadata param
sallybg Mar 18, 2025
8fb8e0b
Accession-based mapping and add cdot to environment
sallybg Apr 8, 2025
25f4332
API support for accession-based mapping
sallybg Apr 9, 2025
f8586ce
Temporarily remove support for multi-target score sets
sallybg Apr 9, 2025
d38f364
Corrections for accession-based mapping without multi-target mapping
sallybg Apr 11, 2025
5869536
Bug fixes for temporary non-multi-target staging deployment
sallybg May 14, 2025
4cdc7a3
Update UTA DB version
sallybg May 15, 2025
ccec5a5
Re-implement multi-target mapping
sallybg May 15, 2025
a44bd2e
Add transcript information to mapped metadata for genomic score sets
sallybg May 22, 2025
5a825e3
Fix CLI prefer_genomic flag
sallybg May 22, 2025
1b1ab48
Change tests and fixtures to reflect multi-target mapper changes
sallybg Jun 3, 2025
c3ade2e
Always return a custom JSON response type from mapping api
sallybg Jun 3, 2025
e4b41b2
CLI: Check that number of mappings for a score set is greater than 0,…
sallybg Jun 3, 2025
20e604d
Remove TODO comments for which backlog entries have been created
sallybg Jun 3, 2025
dc7c48e
Fix BLAT-incompatible target names in multi-target score sets
sallybg Jun 3, 2025
a59c560
Bump mapper version
sallybg Jun 3, 2025
bb04c51
Use https for mock requests
sallybg Jun 4, 2025
37c504e
Mock cdot data provider for tests
sallybg Jun 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h
curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \
(cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install)

# Fetch and index GRCh37 and GRCh38 assemblies for cdot
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.25_GRCh37.p13_genomic.fna.gz
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz
RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz
RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz

RUN mkdir /usr/src/app
WORKDIR /usr/src/app
COPY . .
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"requests",
"biopython",
"tqdm",
"cdot",
"click",
"cool-seq-tool==0.4.0.dev3",
"ga4gh.vrs==2.0.0-a6",
Expand Down
2 changes: 1 addition & 1 deletion settings/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ POSTGRES_DB=gene_normalizer
# Environment variables for UTA connection via CoolSeqTool
####################################################################################################

UTA_DB_URL=postgresql://anonymous:[email protected]:5432/uta/uta_20180821
UTA_DB_URL=postgresql://anonymous:[email protected]:5432/uta/uta_20241220

####################################################################################################
# Environment variables for MaveDB connection
Expand Down
191 changes: 130 additions & 61 deletions src/api/routers/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from fastapi.responses import JSONResponse
from requests import HTTPError

from dcd_mapping.align import AlignmentError, BlatNotFoundError, align
from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
from dcd_mapping.annotate import (
_get_computed_reference_sequence,
_get_mapped_reference_sequence,
Expand All @@ -22,8 +22,14 @@
with_mavedb_score_set,
)
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
from dcd_mapping.transcripts import TxSelectError, select_transcript
from dcd_mapping.schemas import (
ScoreAnnotation,
ScoresetMapping,
TargetType,
TxSelectResult,
VrsVersion,
)
from dcd_mapping.transcripts import select_transcripts
from dcd_mapping.vrs_map import VrsMapError, vrs_map

router = APIRouter(
Expand All @@ -33,21 +39,21 @@

@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
@with_mavedb_score_set
async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping:
async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse:
"""Perform end-to-end mapping for a scoreset.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed this function is a little inconsistent with its return types. Sometimes it returns a ScoresetMapping, and other times it returns a JSONResponse containing a ScoresetMapping. We should probably make that consistent and choose one or the other.


:param urn: identifier for a scoreset.
:param output_path: optional path to save output at
:param vrs_version: version of VRS objects to output (1.3 or 2)
:param silent: if True, suppress console information output
:param store_path: optional path to save output at
"""
try:
metadata = get_scoreset_metadata(urn, store_path)
records = get_scoreset_records(urn, True, store_path)
records = get_scoreset_records(metadata, True, store_path)
except ScoresetNotSupportedError as e:
return ScoresetMapping(
metadata=None,
error_message=str(e).strip("'"),
return JSONResponse(
content=ScoresetMapping(
metadata=None,
error_message=str(e).strip("'"),
).model_dump(exclude_none=True)
)
except ResourceAcquisitionError as e:
msg = f"Unable to acquire resource from MaveDB: {e}"
Expand All @@ -62,7 +68,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
)

try:
alignment_result = align(metadata, True)
alignment_results = build_alignment_result(metadata, True)
except BlatNotFoundError as e:
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
raise HTTPException(status_code=500, detail=msg) from e
Expand All @@ -75,77 +81,151 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)

try:
transcript = await select_transcript(metadata, records, alignment_result)
except (TxSelectError, KeyError, ValueError) as e:
except ScoresetNotSupportedError as e:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)

try:
transcripts = await select_transcripts(metadata, records, alignment_results)
# NOTE: transcript selection errors are handled in select_transcripts,
# and they do not cause the entire mapping process to exit; instead, an error will be reported
# on the target level and on the variant level for variants relative to that target
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
# underlying issues with data providers.
except HTTPError as e:
msg = f"HTTP error occurred during transcript selection: {e}"
raise HTTPException(status_code=500, detail=msg) from e
except DataLookupError as e:
msg = f"Data lookup error occurred during transcript selection: {e}"
raise HTTPException(status_code=500, detail=msg) from e

vrs_results = {}
try:
vrs_results = vrs_map(metadata, alignment_result, records, transcript, True)
for target_gene in metadata.target_genes:
vrs_results[target_gene] = vrs_map(
metadata=metadata.target_genes[target_gene],
align_result=alignment_results[target_gene],
records=records[target_gene],
transcript=transcripts[target_gene],
silent=True,
)
except VrsMapError as e:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if vrs_results is None:
return ScoresetMapping(
metadata=metadata,
error_message="No variant mappings available for this score set",
if not vrs_results or all(
mapping_result is None for mapping_result in vrs_results.values()
):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No variant mappings available for this score set",
).model_dump(exclude_none=True)
)

annotated_vrs_results = {}
try:
vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2)
for target_gene in vrs_results:
annotated_vrs_results[target_gene] = annotate(
vrs_results[target_gene],
transcripts[target_gene],
metadata.target_genes[target_gene],
metadata.urn,
VrsVersion.V_2,
)
except Exception as e:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
if vrs_results is None:
return ScoresetMapping(
metadata=metadata,
error_message="No annotated variant mappings available for this score set",
if not annotated_vrs_results or all(
mapping_result is None for mapping_result in annotated_vrs_results.values()
):
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="No annotated variant mappings available for this score set",
).model_dump(exclude_none=True)
)

try:
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
preferred_layers = {
_set_scoreset_layer(urn, vrs_results),
}

reference_sequences = {
layer: {
"computed_reference_sequence": None,
"mapped_reference_sequence": None,
reference_sequences: dict[str, dict] = {}
mapped_scores: list[ScoreAnnotation] = []
for target_gene in annotated_vrs_results:
preferred_layers = {
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
}
for layer in AnnotationLayer
}

for layer in preferred_layers:
reference_sequences[layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(metadata, layer, transcript)
reference_sequences[layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(layer, transcript, alignment_result)
target_gene_name = metadata.target_genes[target_gene].target_gene_name
reference_sequences[target_gene_name] = {
layer: {
"computed_reference_sequence": None,
"mapped_reference_sequence": None,
}
for layer in preferred_layers
}
        # sometimes NoneType layers show up in the preferred layers set; remove these
preferred_layers.discard(None)
for layer in preferred_layers:
reference_sequences[target_gene_name][layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(
metadata.target_genes[target_gene], layer, transcripts[target_gene]
)
reference_sequences[target_gene_name][layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(
metadata.target_genes[target_gene],
layer,
transcripts[target_gene],
alignment_results[target_gene],
)

for m in annotated_vrs_results[target_gene]:
if m.pre_mapped is None:
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
elif m.annotation_layer in preferred_layers:
# drop annotation layer from mapping object
mapped_scores.append(ScoreAnnotation(**m.model_dump()))

        # drop NoneType reference sequences
for target_gene in reference_sequences:
for layer in list(reference_sequences[target_gene].keys()):
if (
reference_sequences[target_gene][layer][
"mapped_reference_sequence"
]
is None
and reference_sequences[target_gene][layer][
"computed_reference_sequence"
]
is None
) or layer is None:
del reference_sequences[target_gene][layer]

# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
if (
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
and metadata.target_genes[target_gene].target_gene_category
== TargetType.PROTEIN_CODING
and metadata.target_genes[target_gene].target_accession_id is None
and transcripts[target_gene] is not None
and isinstance(transcripts[target_gene], TxSelectResult)
and transcripts[target_gene].nm is not None
):
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
"computed_reference_sequence": None,
"mapped_reference_sequence": {
"sequence_accessions": [transcripts[target_gene].nm]
},
}

mapped_scores: list[ScoreAnnotation] = []
for m in vrs_results:
if m.annotation_layer in preferred_layers:
# drop annotation layer from mapping object
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
except Exception as e:
return JSONResponse(
content=ScoresetMapping(
Expand All @@ -156,18 +236,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
return JSONResponse(
content=ScoresetMapping(
metadata=raw_metadata,
computed_protein_reference_sequence=reference_sequences[
AnnotationLayer.PROTEIN
]["computed_reference_sequence"],
mapped_protein_reference_sequence=reference_sequences[
AnnotationLayer.PROTEIN
]["mapped_reference_sequence"],
computed_genomic_reference_sequence=reference_sequences[
AnnotationLayer.GENOMIC
]["computed_reference_sequence"],
mapped_genomic_reference_sequence=reference_sequences[
AnnotationLayer.GENOMIC
]["mapped_reference_sequence"],
reference_sequences=reference_sequences,
mapped_scores=mapped_scores,
).model_dump(exclude_none=True)
)
Loading