Skip to content

Commit 8fb8e0b

Browse files
committed
Accession-based mapping and add cdot to environment
1 parent 20e3aa2 commit 8fb8e0b

File tree

10 files changed

+522
-138
lines changed

10 files changed

+522
-138
lines changed

Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h
3737
curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \
3838
(cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install)
3939

40+
# Fetch and index GRCh37 and GRCh38 assemblies for cdot
41+
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.25_GRCh37.p13_genomic.fna.gz
42+
RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz
43+
RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz
44+
RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz
45+
4046
RUN mkdir /usr/src/app
4147
WORKDIR /usr/src/app
4248
COPY . .

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"requests",
3636
"biopython",
3737
"tqdm",
38+
"cdot",
3839
"click",
3940
"cool-seq-tool==0.4.0.dev3",
4041
"ga4gh.vrs==2.0.0-a6",

src/dcd_mapping/align.py

Lines changed: 133 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414
from cool_seq_tool.schemas import Strand
1515

1616
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
17-
from dcd_mapping.mavedb_data import (
18-
LOCAL_STORE_PATH,
19-
)
17+
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError
2018
from dcd_mapping.resource_utils import (
2119
ResourceAcquisitionError,
2220
http_download,
@@ -180,36 +178,35 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa:
180178
:return: dict where keys are target gene identifiers and values are BLAT query result objects
181179
:raise AlignmentError: if BLAT subprocess returns error code
182180
"""
183-
return parse_blat(f"{metadata.urn}_blat.psl", "blat-psl")
184-
# with tempfile.NamedTemporaryFile() as tmp_file:
185-
# query_file = _build_query_file(metadata, Path(tmp_file.name))
186-
# target_sequence_type = _get_target_sequence_type(metadata)
187-
# if target_sequence_type == TargetSequenceType.PROTEIN:
188-
# target_args = "-q=prot -t=dnax"
189-
# elif target_sequence_type == TargetSequenceType.DNA:
190-
# target_args = ""
191-
# else:
192-
# # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
193-
# msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
194-
# raise NotImplementedError(msg)
195-
# process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
196-
# out_file = _write_blat_output_tempfile(process_result)
197-
198-
# try:
199-
# output = parse_blat(out_file, "blat-psl")
200-
201-
# # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
202-
# except ValueError:
203-
# target_args = "-q=dnax -t=dnax"
204-
# process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
205-
# out_file = _write_blat_output_tempfile(process_result)
206-
# try:
207-
# output = parse_blat(out_file, "blat-psl")
208-
# except ValueError as e:
209-
# msg = f"Unable to run successful BLAT on {metadata.urn}"
210-
# raise AlignmentError(msg) from e
211-
212-
# return output
181+
with tempfile.NamedTemporaryFile() as tmp_file:
182+
query_file = _build_query_file(metadata, Path(tmp_file.name))
183+
target_sequence_type = _get_target_sequence_type(metadata)
184+
if target_sequence_type == TargetSequenceType.PROTEIN:
185+
target_args = "-q=prot -t=dnax"
186+
elif target_sequence_type == TargetSequenceType.DNA:
187+
target_args = ""
188+
else:
189+
# TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
190+
msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
191+
raise NotImplementedError(msg)
192+
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
193+
out_file = _write_blat_output_tempfile(process_result)
194+
195+
try:
196+
output = parse_blat(out_file, "blat-psl")
197+
198+
# TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
199+
except ValueError:
200+
target_args = "-q=dnax -t=dnax"
201+
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
202+
out_file = _write_blat_output_tempfile(process_result)
203+
try:
204+
output = parse_blat(out_file, "blat-psl")
205+
except ValueError as e:
206+
msg = f"Unable to run successful BLAT on {metadata.urn}"
207+
raise AlignmentError(msg) from e
208+
209+
return output
213210

214211

215212
def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
@@ -342,3 +339,106 @@ def align(
342339
target_gene = scoreset_metadata.target_genes[target_label]
343340
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
344341
return alignment_results
342+
343+
344+
def fetch_alignment(
345+
metadata: ScoresetMetadata, silent: bool
346+
) -> dict[str, AlignmentResult | None]:
347+
alignment_results = {}
348+
for target_gene in metadata.target_genes:
349+
accession_id = metadata.target_genes[target_gene].target_accession_id
350+
# protein and contig/chromosome accession ids do not need to be aligned to the genome
351+
if accession_id.startswith(("NP", "ENSP", "NC_")):
352+
alignment_results[accession_id] = None
353+
else:
354+
url = f"https://cdot.cc/transcript/{accession_id}"
355+
r = requests.get(url, timeout=30)
356+
357+
try:
358+
r.raise_for_status()
359+
except requests.HTTPError as e:
360+
msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
361+
_logger.error(msg)
362+
raise ResourceAcquisitionError(msg) from e
363+
364+
cdot_mapping = r.json()
365+
alignment_results[accession_id] = parse_cdot_mapping(cdot_mapping, silent)
366+
return alignment_results
367+
368+
369+
def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
370+
# blat psl & AlignmentResult: 0-based, start inclusive, stop exclusive
371+
# cdot: 1-based, start inclusive, stop inclusive
372+
# so, to "translate" cdot ranges to AlignmentResult-style ranges:
373+
# subtract 1 from start and end to go from 1-based to 0-based coord,
374+
# and then add 1 to the stop to go from inclusive to exclusive
375+
# so just subtract 1 from start and do nothing to end
376+
377+
grch38 = cdot_mapping.get("genome_builds", {}).get("GRCh38")
378+
grch37 = cdot_mapping.get("genome_builds", {}).get("GRCh37")
379+
mapping = grch38 if grch38 else grch37
380+
if mapping is None:
381+
msg = f"Cdot transcript results for transcript {cdot_mapping.get('id')} do not include GRCh37 or GRCh38 mapping"
382+
raise AlignmentError(msg)
383+
384+
chrom = mapping["contig"]
385+
strand = Strand.POSITIVE if mapping["strand"] == "+" else Strand.NEGATIVE
386+
query_subranges = []
387+
hit_subranges = []
388+
for exon in mapping["exons"]:
389+
query_subranges.append(SequenceRange(start=exon[3] - 1, end=exon[4]))
390+
hit_subranges.append(SequenceRange(start=exon[0] - 1, end=exon[1]))
391+
392+
if strand == Strand.POSITIVE:
393+
query_range = SequenceRange(
394+
start=query_subranges[0].start, end=query_subranges[-1].end
395+
)
396+
hit_range = SequenceRange(
397+
start=hit_subranges[0].start, end=hit_subranges[-1].end
398+
)
399+
else:
400+
query_range = SequenceRange(
401+
start=query_subranges[-1].start, end=query_subranges[0].end
402+
)
403+
hit_range = SequenceRange(
404+
start=hit_subranges[-1].start, end=hit_subranges[0].end
405+
)
406+
407+
return AlignmentResult(
408+
chrom=chrom,
409+
strand=strand,
410+
query_range=query_range,
411+
query_subranges=query_subranges,
412+
hit_range=hit_range,
413+
hit_subranges=hit_subranges,
414+
)
415+
416+
417+
def build_alignment_result(
418+
metadata: ScoresetMetadata, silent: bool
419+
) -> dict[str, AlignmentResult | None]:
420+
# NOTE: Score set must contain all accession-based target genes or all sequence-based target genes
421+
# This decision was made because it is most efficient to run BLAT all together, so the alignment function
422+
# works on an entire score set rather than per target gene.
423+
# However, if the need arises, we can allow both types of target genes in a score set.
424+
425+
# determine whether score set is accession-based or sequence-based
426+
score_set_type = None
427+
for target_gene in metadata.target_genes:
428+
if metadata.target_genes[target_gene].target_accession_id:
429+
if score_set_type == "sequence":
430+
msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported."
431+
raise ScoresetNotSupportedError(msg)
432+
score_set_type = "accession"
433+
else:
434+
if score_set_type == "accession":
435+
msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported."
436+
raise ScoresetNotSupportedError(msg)
437+
score_set_type = "sequence"
438+
439+
if score_set_type == "sequence":
440+
alignment_result = align(metadata, silent)
441+
else:
442+
alignment_result = fetch_alignment(metadata, silent)
443+
444+
return alignment_result

src/dcd_mapping/annotate.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,14 +419,36 @@ def _get_computed_reference_sequence(
419419
metadata: TargetGene,
420420
layer: AnnotationLayer,
421421
tx_output: TxSelectResult | TxSelectError | None = None,
422-
) -> ComputedReferenceSequence | None:
422+
) -> ComputedReferenceSequence | MappedReferenceSequence | None:
423423
"""Report the computed reference sequence for a score set
424424
425425
:param metadata: Target gene metadata from MaveDB API
426426
:param layer: AnnotationLayer
427427
:param tx_output: Transcript data for a score set
428-
:return A ComputedReferenceSequence object
428+
:return A ComputedReferenceSequence object,
429+
or if the target gene is accession-based, a mapped reference sequence describing the pre-mapped reference
429430
"""
431+
# accession-based target genes always use accession id as pre-mapped reference sequence
432+
if metadata.target_accession_id:
433+
seq_id = get_vrs_id_from_identifier(metadata.target_accession_id)
434+
# use MappedReferenceSequence type because there should be an accession id but no sequence.
435+
# for accession-based target genes, the object returned by this function describes the provided reference accession
436+
# whereas the object returned by _get_mapped_reference_sequence describes the mapped reference accession, which could be a chromosome for ex.
437+
seq_type: TargetSequenceType
438+
# TODO full list of protein accession id prefixes
439+
if metadata.target_accession_id.startswith(("NP", "ENSP")):
440+
seq_type = TargetSequenceType.PROTEIN
441+
# TODO full list of transcript and contig accession id prefixes
442+
elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")):
443+
seq_type = TargetSequenceType.DNA
444+
else:
445+
msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}"
446+
raise ValueError(msg)
447+
return MappedReferenceSequence(
448+
sequence_type=seq_type,
449+
sequence_id=seq_id,
450+
sequence_accessions=[metadata.target_accession_id],
451+
)
430452
if layer == AnnotationLayer.PROTEIN:
431453
if tx_output is None or isinstance(tx_output, TxSelectError):
432454
# TODO catch this error - don't stop whole job for one failed target

src/dcd_mapping/lookup.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
import os
1313
from pathlib import Path
1414

15+
import hgvs
1516
import polars as pl
1617
import requests
1718
from biocommons.seqrepo import SeqRepo
1819
from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3
20+
from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
1921
from cool_seq_tool.app import (
2022
LRG_REFSEQGENE_PATH,
2123
MANE_SUMMARY_PATH,
@@ -42,6 +44,7 @@
4244
)
4345
from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace
4446
from ga4gh.vrs.extras.translator import AlleleTranslator
47+
from ga4gh.vrs.utils.hgvs_tools import HgvsTools
4548
from gene.database import create_db
4649
from gene.query import QueryHandler
4750
from gene.schemas import MatchType, SourceName
@@ -70,6 +73,23 @@
7073
]
7174
_logger = logging.getLogger(__name__)
7275

76+
# ---------------------------------- Cdot ---------------------------------- #
77+
78+
79+
GENOMIC_FASTA_FILES = [
80+
"/home/.local/share/dcd_mapping/GCF_000001405.39_GRCh38.p13_genomic.fna.gz",
81+
"/home/.local/share/dcd_mapping/GCF_000001405.25_GRCh37.p13_genomic.fna.gz",
82+
]
83+
84+
85+
def seqfetcher() -> ChainedSeqFetcher:
86+
return ChainedSeqFetcher(*[FastaSeqFetcher(file) for file in GENOMIC_FASTA_FILES])
87+
88+
89+
def cdot_rest() -> RESTDataProvider:
90+
return RESTDataProvider(seqfetcher=seqfetcher())
91+
92+
7393
# ---------------------------------- Global ---------------------------------- #
7494

7595

@@ -180,6 +200,15 @@ def __new__(cls) -> QueryHandler:
180200
return cls.instance
181201

182202

203+
def init_hgvs_tools(self, data_proxy=None): # noqa: ANN202, ANN001
204+
"""Initialize HgvsTools with cdot as data provider"""
205+
self.parser = hgvs.parser.Parser()
206+
self.data_proxy = data_proxy
207+
cdot_provider = cdot_rest()
208+
self.normalizer = hgvs.normalizer.Normalizer(cdot_provider, validate=True)
209+
self.variant_mapper = hgvs.variantmapper.VariantMapper(cdot_provider)
210+
211+
183212
class TranslatorBuilder:
184213
"""Singleton constructor for VRS Translator instance."""
185214

@@ -190,6 +219,8 @@ def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator:
190219
:return: singleton instance of ``AlleleTranslator``
191220
"""
192221
if not hasattr(cls, "instance"):
222+
# monkey patch to use cdot instead of UTA as HgvsTools data provider
223+
HgvsTools.__init__ = init_hgvs_tools
193224
tr = AlleleTranslator(data_proxy)
194225
cls.instance = tr
195226
else:
@@ -430,6 +461,11 @@ def get_chromosome_identifier(chromosome: str) -> str:
430461
:return: latest ID if available
431462
:raise KeyError: if unable to retrieve identifier
432463
"""
464+
# target sequence alignment references are chromosome names like ``"8"``, ``"X"``
465+
# but accession alignment information from cdot has reference accessions, beginning with "NC_"
466+
# for "NC_" identifiers, just return the identifier
467+
if chromosome.startswith("NC_"):
468+
return chromosome
433469
if not chromosome.startswith("chr"):
434470
chromosome = f"chr{chromosome}"
435471
sr = get_seqrepo()

src/dcd_mapping/main.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import click
99
from requests import HTTPError
1010

11-
from dcd_mapping.align import AlignmentError, BlatNotFoundError, align
11+
from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
1212
from dcd_mapping.annotate import (
1313
annotate,
1414
save_mapped_output_json,
@@ -156,7 +156,8 @@ async def map_scoreset(
156156

157157
_emit_info(f"Performing alignment for {metadata.urn}...", silent)
158158
try:
159-
alignment_results = align(metadata, silent)
159+
# dictionary where keys are target gene labels or accession ids, and values are alignment result objects
160+
alignment_results = build_alignment_result(metadata, silent)
160161
except BlatNotFoundError as e:
161162
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
162163
_emit_info(msg, silent, logging.ERROR)
@@ -175,6 +176,15 @@ async def map_scoreset(
175176
)
176177
_emit_info(f"Score set mapping output saved to: {final_output}.", silent)
177178
return
179+
except ScoresetNotSupportedError as e:
180+
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
181+
final_output = write_scoreset_mapping_to_json(
182+
metadata.urn,
183+
ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")),
184+
output_path,
185+
)
186+
_emit_info(f"Score set mapping output saved to: {final_output}.", silent)
187+
return
178188
_emit_info("Alignment complete.", silent)
179189

180190
_emit_info("Selecting reference sequence...", silent)

0 commit comments

Comments
 (0)