diff --git a/Dockerfile b/Dockerfile index 363ee58..a38827f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,12 @@ RUN curl -L https://github.com/samtools/htslib/releases/download/${htsversion}/h curl -L https://github.com/samtools/bcftools/releases/download/${htsversion}/bcftools-${htsversion}.tar.bz2 | tar xj && \ (cd bcftools-${htsversion} && ./configure --enable-libgsl --enable-perl-filters --with-htslib=system && make install) +# Fetch and index GRCh37 and GRCh38 assemblies for cdot +RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.25_GRCh37.p13_genomic.fna.gz +RUN wget -O - https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz | gzip -d | bgzip > GCF_000001405.39_GRCh38.p13_genomic.fna.gz +RUN samtools faidx GCF_000001405.25_GRCh37.p13_genomic.fna.gz +RUN samtools faidx GCF_000001405.39_GRCh38.p13_genomic.fna.gz + RUN mkdir /usr/src/app WORKDIR /usr/src/app COPY . . 
diff --git a/pyproject.toml b/pyproject.toml index 331d3dc..e7568e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "requests", "biopython", "tqdm", + "cdot", "click", "cool-seq-tool==0.4.0.dev3", "ga4gh.vrs==2.0.0-a6", diff --git a/settings/.env.dev b/settings/.env.dev index 4e761c7..956bfae 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -17,7 +17,7 @@ POSTGRES_DB=gene_normalizer # Environment variables for UTA connection via CoolSeqTool #################################################################################################### -UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20180821 +UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20241220 #################################################################################################### # Environment variables for MaveDB connection diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 2f34def..2e65d2a 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, align +from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, @@ -22,8 +22,14 @@ with_mavedb_score_set, ) from dcd_mapping.resource_utils import ResourceAcquisitionError -from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion -from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.schemas import ( + ScoreAnnotation, + ScoresetMapping, + TargetType, + TxSelectResult, + VrsVersion, +) +from dcd_mapping.transcripts import select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map router = APIRouter( @@ -33,21 +39,21 @@ @router.post(path="/map/{urn}", status_code=200, 
response_model=ScoresetMapping) @with_mavedb_score_set -async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping: +async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse: """Perform end-to-end mapping for a scoreset. :param urn: identifier for a scoreset. - :param output_path: optional path to save output at - :param vrs_version: version of VRS objects to output (1.3 or 2) - :param silent: if True, suppress console information output + :param store_path: optional path to save output at """ try: metadata = get_scoreset_metadata(urn, store_path) - records = get_scoreset_records(urn, True, store_path) + records = get_scoreset_records(metadata, True, store_path) except ScoresetNotSupportedError as e: - return ScoresetMapping( - metadata=None, - error_message=str(e).strip("'"), + return JSONResponse( + content=ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ).model_dump(exclude_none=True) ) except ResourceAcquisitionError as e: msg = f"Unable to acquire resource from MaveDB: {e}" @@ -62,7 +68,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp ) try: - alignment_result = align(metadata, True) + alignment_results = build_alignment_result(metadata, True) except BlatNotFoundError as e: msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." 
raise HTTPException(status_code=500, detail=msg) from e @@ -75,15 +81,20 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - - try: - transcript = await select_transcript(metadata, records, alignment_result) - except (TxSelectError, KeyError, ValueError) as e: + except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) + + try: + transcripts = await select_transcripts(metadata, records, alignment_results) + # NOTE: transcript selection errors are handled in select_transcripts, + # and they do not cause the entire mapping process to exit; instead, an error will be reported + # on the target level and on the variant level for variants relative to that target + # HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate + # underlying issues with data providers. 
except HTTPError as e: msg = f"HTTP error occurred during transcript selection: {e}" raise HTTPException(status_code=500, detail=msg) from e @@ -91,61 +102,130 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp msg = f"Data lookup error occurred during transcript selection: {e}" raise HTTPException(status_code=500, detail=msg) from e + vrs_results = {} try: - vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + for target_gene in metadata.target_genes: + vrs_results[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=alignment_results[target_gene], + records=records[target_gene], + transcript=transcripts[target_gene], + silent=True, + ) except VrsMapError as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if vrs_results is None: - return ScoresetMapping( - metadata=metadata, - error_message="No variant mappings available for this score set", + if not vrs_results or all( + mapping_result is None for mapping_result in vrs_results.values() + ): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ).model_dump(exclude_none=True) ) + annotated_vrs_results = {} try: - vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2) + for target_gene in vrs_results: + annotated_vrs_results[target_gene] = annotate( + vrs_results[target_gene], + transcripts[target_gene], + metadata.target_genes[target_gene], + metadata.urn, + VrsVersion.V_2, + ) except Exception as e: return JSONResponse( content=ScoresetMapping( metadata=metadata, error_message=str(e).strip("'") ).model_dump(exclude_none=True) ) - if vrs_results is None: - return ScoresetMapping( - metadata=metadata, - error_message="No annotated variant mappings available for this score set", + if not annotated_vrs_results or all( + mapping_result 
is None for mapping_result in annotated_vrs_results.values() + ): + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ).model_dump(exclude_none=True) ) try: raw_metadata = get_raw_scoreset_metadata(urn, store_path) - preferred_layers = { - _set_scoreset_layer(urn, vrs_results), - } - - reference_sequences = { - layer: { - "computed_reference_sequence": None, - "mapped_reference_sequence": None, + reference_sequences: dict[str, dict] = {} + mapped_scores: list[ScoreAnnotation] = [] + for target_gene in annotated_vrs_results: + preferred_layers = { + _set_scoreset_layer(urn, annotated_vrs_results[target_gene]), } - for layer in AnnotationLayer - } - - for layer in preferred_layers: - reference_sequences[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence(metadata, layer, transcript) - reference_sequences[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) + target_gene_name = metadata.target_genes[target_gene].target_gene_name + reference_sequences[target_gene_name] = { + layer: { + "computed_reference_sequence": None, + "mapped_reference_sequence": None, + } + for layer in preferred_layers + } + # sometimes Nonetype layers show up in preferred layers dict; remove these + preferred_layers.discard(None) + for layer in preferred_layers: + reference_sequences[target_gene_name][layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence( + metadata.target_genes[target_gene], layer, transcripts[target_gene] + ) + reference_sequences[target_gene_name][layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence( + metadata.target_genes[target_gene], + layer, + transcripts[target_gene], + alignment_results[target_gene], + ) + + for m in annotated_vrs_results[target_gene]: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + 
elif m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + + # drop Nonetype reference sequences + for target_gene in reference_sequences: + for layer in list(reference_sequences[target_gene].keys()): + if ( + reference_sequences[target_gene][layer][ + "mapped_reference_sequence" + ] + is None + and reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[target_gene][layer] + + # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict + if ( + AnnotationLayer.GENOMIC in reference_sequences[target_gene_name] + and metadata.target_genes[target_gene].target_gene_category + == TargetType.PROTEIN_CODING + and metadata.target_genes[target_gene].target_accession_id is None + and transcripts[target_gene] is not None + and isinstance(transcripts[target_gene], TxSelectResult) + and transcripts[target_gene].nm is not None + ): + reference_sequences[target_gene_name][AnnotationLayer.CDNA] = { + "computed_reference_sequence": None, + "mapped_reference_sequence": { + "sequence_accessions": [transcripts[target_gene].nm] + }, + } - mapped_scores: list[ScoreAnnotation] = [] - for m in vrs_results: - if m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) except Exception as e: return JSONResponse( content=ScoresetMapping( @@ -156,18 +236,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp return JSONResponse( content=ScoresetMapping( metadata=raw_metadata, - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["mapped_reference_sequence"], - 
computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["mapped_reference_sequence"], + reference_sequences=reference_sequences, mapped_scores=mapped_scores, ).model_dump(exclude_none=True) ) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index db2d3f8..b64f67a 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -8,14 +8,12 @@ import requests from Bio.SearchIO import HSP -from Bio.SearchIO import read as read_blat +from Bio.SearchIO import parse as parse_blat from Bio.SearchIO._model import Hit, QueryResult from cool_seq_tool.schemas import Strand from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location -from dcd_mapping.mavedb_data import ( - LOCAL_STORE_PATH, -) +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError from dcd_mapping.resource_utils import ( ResourceAcquisitionError, http_download, @@ -25,6 +23,7 @@ GeneLocation, ScoresetMetadata, SequenceRange, + TargetGene, TargetSequenceType, ) @@ -61,7 +60,10 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) -> :return: Yielded Path to constructed file. Deletes file once complete. 
""" _logger.debug("Writing BLAT query to %s", query_file) - lines = [">query", scoreset_metadata.target_sequence] + lines = [] + for target_gene in scoreset_metadata.target_genes: + lines.append(f">{target_gene}") + lines.append(scoreset_metadata.target_genes[target_gene].target_sequence) _write_query_file(query_file, lines) return query_file @@ -143,7 +145,30 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str: return tmp.name -def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: +def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str: + """Get overall target sequence type for a score set's target genes. + Protein if all target sequences are protein sequences, nucleotide if all target + sequences are nucleotide sequences, and mixed if there is a mix within the score set. + :param metadata: object containing score set attributes + :return: TargetSequenceType enum (protein or nucleotide) or string "mixed" + """ + target_sequence_types = set() + for target_gene in metadata.target_genes: + target_sequence_types.add( + metadata.target_genes[target_gene].target_sequence_type + ) + if len(target_sequence_types) > 1: + return "mixed" + elif len(target_sequence_types) == 1: # noqa: RET505 + return target_sequence_types.pop() + else: + msg = f"Target sequence types not available for score set {metadata.urn}" + raise ValueError(msg) + + +def _get_blat_output( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, QueryResult]: """Run a BLAT query and returns a path to the output object. 
If unable to produce a valid query the first time, then try a query using ``dnax`` @@ -151,26 +176,32 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: :param scoreset_metadata: object containing scoreset attributes :param silent: suppress BLAT command output - :return: BLAT query result + :return: dict where keys are target gene identifiers and values are BLAT query result objects :raise AlignmentError: if BLAT subprocess returns error code """ with tempfile.NamedTemporaryFile() as tmp_file: query_file = _build_query_file(metadata, Path(tmp_file.name)) - if metadata.target_sequence_type == TargetSequenceType.PROTEIN: + target_sequence_type = _get_target_sequence_type(metadata) + if target_sequence_type == TargetSequenceType.PROTEIN: target_args = "-q=prot -t=dnax" - else: + elif target_sequence_type == TargetSequenceType.DNA: target_args = "" + else: + # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. + msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." 
+ raise NotImplementedError(msg) process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) out_file = _write_blat_output_tempfile(process_result) try: - output = read_blat(out_file, "blat-psl") + output = list(parse_blat(out_file, "blat-psl")) + except ValueError: target_args = "-q=dnax -t=dnax" process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) out_file = _write_blat_output_tempfile(process_result) try: - output = read_blat(out_file, "blat-psl") + output = list(parse_blat(out_file, "blat-psl")) except ValueError as e: msg = f"Unable to run successful BLAT on {metadata.urn}" raise AlignmentError(msg) from e @@ -178,7 +209,7 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: return output -def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: +def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: """Get best hit from BLAT output. First, try to return hit corresponding to expected chromosome taken from scoreset @@ -186,7 +217,6 @@ the hit with the single highest-scoring HSP. :param output: BLAT output - :param urn: scoreset URN to use in error messages :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"`` :return: best Hit :raise AlignmentError: if unable to get hits from output @@ -207,8 +237,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: hit_chrs = [h.id for h in output] # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome _logger.warning( - "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s", - urn, + "Failed to match hit chromosomes during alignment for target %s.
Expected chromosome: %s, hit chromosomes: %s", + output.id, chromosome, hit_chrs, ) @@ -222,13 +252,13 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: best_score_hit = hit if best_score_hit is None: - msg = f"Couldn't get BLAT hits from {urn}" + msg = f"Couldn't get BLAT hits for target {output.id}." raise AlignmentError(msg) return best_score_hit -def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP: +def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP: """Retrieve preferred HSP from BLAT Hit object. If gene location data is available, prefer the HSP with the least distance @@ -236,7 +266,6 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP take the HSP with the highest score value. :param hit: hit object from BLAT result - :param urn: scoreset identifier for use in error messages :param gene_location: location data acquired by normalizing scoreset metadata :return: Preferred HSP object :raise AlignmentError: if hit object appears to be empty (should be impossible) @@ -252,17 +281,17 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP return best_hsp -def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> AlignmentResult: +def _get_best_match(output: QueryResult, target_gene: TargetGene) -> AlignmentResult: """Obtain best high-scoring pairs (HSP) object for query sequence. :param metadata: scoreset metadata :param output: BLAT result object :return: alignment result ?? 
""" - location = get_gene_location(metadata) + location = get_gene_location(target_gene) chromosome = location.chromosome if location else None - best_hit = _get_best_hit(output, metadata.urn, chromosome) - best_hsp = _get_best_hsp(best_hit, metadata.urn, location) + best_hit = _get_best_hit(output, chromosome) + best_hsp = _get_best_hsp(best_hit, location) strand = Strand.POSITIVE if best_hsp[0].query_strand == 1 else Strand.NEGATIVE coverage = 100 * (best_hsp.query_end - best_hsp.query_start) / output.seq_len @@ -291,12 +320,150 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen ) -def align(scoreset_metadata: ScoresetMetadata, silent: bool = True) -> AlignmentResult: +def align( + scoreset_metadata: ScoresetMetadata, silent: bool = True +) -> dict[str, AlignmentResult]: """Align target sequence to a reference genome. :param scoreset_metadata: object containing scoreset metadata :param silent: suppress BLAT process output if true - :return: data wrapper containing alignment results + :return: dictionary where keys are target gene identifiers and values are alignment result objects """ blat_output = _get_blat_output(scoreset_metadata, silent) - return _get_best_match(blat_output, scoreset_metadata) + alignment_results = {} + for blat_result in blat_output: + target_label = blat_result.id + # blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets + if target_label == "query" and len(scoreset_metadata.target_genes) == 1: + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 + # blat automatically reformats query names, so sometimes they don't match our metadata + if target_label not in scoreset_metadata.target_genes: + # if single-target score set, don't need to match by name + if len(scoreset_metadata.target_genes) == 1: + target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015 + else: + # try to match query 
name to a target gene in the metadata + matches = 0 + for target_gene_name in scoreset_metadata.target_genes: + blat_target_gene_name = ( + target_gene_name.split(" ")[0] + .replace("(", "") + .replace(")", "") + .replace(",", "") + ) + if blat_target_gene_name == target_label: + target_label = target_gene_name + matches += 1 + # we may be missing some blat reformatting rules here - if so, this error will be thrown + if matches == 0: + msg = f"BLAT result {target_label} does not match any target gene names in scoreset {scoreset_metadata.urn}." + raise AlignmentError(msg) + if matches > 1: + # could happen if multiple target genes have the same first word in their label (unlikely) + raise AlignmentError(f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}") + target_gene = scoreset_metadata.target_genes[target_label] + alignment_results[target_label] = _get_best_match(blat_result, target_gene) + return alignment_results + + +def fetch_alignment( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, AlignmentResult | None]: + alignment_results = {} + for target_gene in metadata.target_genes: + accession_id = metadata.target_genes[target_gene].target_accession_id + # protein and contig/chromosome accession ids do not need to be aligned to the genome + if accession_id.startswith(("NP", "ENSP", "NC_")): + alignment_results[target_gene] = None + else: + url = f"https://cdot.cc/transcript/{accession_id}" + r = requests.get(url, timeout=30) + + try: + r.raise_for_status() + except requests.HTTPError as e: + msg = f"Received HTTPError from {url} for scoreset {metadata.urn}" + _logger.error(msg) + raise ResourceAcquisitionError(msg) from e + + cdot_mapping = r.json() + alignment_results[target_gene] = parse_cdot_mapping(cdot_mapping, silent) + return alignment_results + + +def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult: + # blat psl & AlignmentResult: 0-based, start inclusive, stop exclusive + # cdot:
1-based, start inclusive, stop inclusive + # so, to "translate" cdot ranges to AlignmentResult-style ranges: + # subtract 1 from start and end to go from 1-based to 0-based coord, + # and then add 1 to the stop to go from inclusive to exclusive + # so just subtract 1 from start and do nothing to end + + grch38 = cdot_mapping.get("genome_builds", {}).get("GRCh38") + grch37 = cdot_mapping.get("genome_builds", {}).get("GRCh37") + mapping = grch38 if grch38 else grch37 + if mapping is None: + msg = f"Cdot transcript results for transcript {cdot_mapping.get('id')} do not include GRCh37 or GRCh38 mapping" + raise AlignmentError(msg) + + chrom = mapping["contig"] + strand = Strand.POSITIVE if mapping["strand"] == "+" else Strand.NEGATIVE + query_subranges = [] + hit_subranges = [] + for exon in mapping["exons"]: + query_subranges.append(SequenceRange(start=exon[3] - 1, end=exon[4])) + hit_subranges.append(SequenceRange(start=exon[0] - 1, end=exon[1])) + + if strand == Strand.POSITIVE: + query_range = SequenceRange( + start=query_subranges[0].start, end=query_subranges[-1].end + ) + hit_range = SequenceRange( + start=hit_subranges[0].start, end=hit_subranges[-1].end + ) + else: + query_range = SequenceRange( + start=query_subranges[-1].start, end=query_subranges[0].end + ) + hit_range = SequenceRange( + start=hit_subranges[-1].start, end=hit_subranges[0].end + ) + + return AlignmentResult( + chrom=chrom, + strand=strand, + query_range=query_range, + query_subranges=query_subranges, + hit_range=hit_range, + hit_subranges=hit_subranges, + ) + + +def build_alignment_result( + metadata: ScoresetMetadata, silent: bool +) -> dict[str, AlignmentResult | None]: + # NOTE: Score set must contain all accession-based target genes or all sequence-based target genes + # This decision was made because it is most efficient to run BLAT all together, so the alignment function + # works on an entire score set rather than per target gene. 
+ # However, if the need arises, we can allow both types of target genes in a score set. + + # determine whether score set is accession-based or sequence-based + score_set_type = None + for target_gene in metadata.target_genes: + if metadata.target_genes[target_gene].target_accession_id: + if score_set_type == "sequence": + msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported." + raise ScoresetNotSupportedError(msg) + score_set_type = "accession" + else: + if score_set_type == "accession": + msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported." + raise ScoresetNotSupportedError(msg) + score_set_type = "sequence" + + if score_set_type == "sequence": + alignment_result = align(metadata, silent) + else: + alignment_result = fetch_alignment(metadata, silent) + + return alignment_result diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index b8f7342..9a9532e 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -39,10 +39,13 @@ ScoreAnnotationWithLayer, ScoresetMapping, ScoresetMetadata, + TargetGene, TargetSequenceType, + TargetType, TxSelectResult, VrsVersion, ) +from dcd_mapping.transcripts import TxSelectError _logger = logging.getLogger(__name__) @@ -133,14 +136,15 @@ def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]: def _get_vrs_ref_allele_seq( - allele: Allele, metadata: ScoresetMetadata, tx_select_results: TxSelectResult | None + allele: Allele, + metadata: TargetGene, + urn: str, + tx_select_results: TxSelectResult | None, ) -> Extension: """Create `vrs_ref_allele_seq` property.""" - start, end = _offset_allele_ref_seq( - metadata.urn, allele.location.start, allele.location.end - ) + start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end) if ( - metadata.urn.startswith( + urn.startswith( ( "urn:mavedb:00000047", "urn:mavedb:00000048", @@ -149,6 
+153,7 @@ def _get_vrs_ref_allele_seq( ) ) and tx_select_results is not None + and isinstance(tx_select_results, TxSelectResult) ): seq = tx_select_results.sequence ref = seq[start:end] @@ -241,8 +246,9 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]: def _annotate_allele_mapping( mapped_score: MappedScore, - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for allele mappings.""" @@ -250,7 +256,9 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results)] + pre_mapped.extensions = [ + _get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results) + ] if post_mapped: # Determine reference sequence @@ -262,7 +270,7 @@ def _annotate_allele_mapping( if accession.startswith("refseq:"): accession = accession[7:] else: - if tx_results is None: + if tx_results is None or isinstance(tx_results, TxSelectError): raise ValueError # impossible by definition accession = tx_results.np @@ -295,8 +303,9 @@ def _annotate_allele_mapping( def _annotate_haplotype_mapping( mapped_score: MappedScore, - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" @@ -304,7 +313,7 @@ def _annotate_haplotype_mapping( post_mapped: Haplotype = mapped_score.post_mapped # type: ignore # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: - allele.extensions = 
[_get_vrs_ref_allele_seq(allele, metadata, tx_results)] + allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)] if post_mapped: # Determine reference sequence @@ -316,7 +325,7 @@ def _annotate_haplotype_mapping( if accession.startswith("refseq:"): accession = accession[7:] else: - if tx_results is None: + if tx_results is None or isinstance(tx_results, TxSelectError): raise ValueError # impossible by definition accession = tx_results.np @@ -350,8 +359,9 @@ def _annotate_haplotype_mapping( def annotate( mapped_scores: list[MappedScore], - tx_results: TxSelectResult | None, - metadata: ScoresetMetadata, + tx_results: TxSelectResult | TxSelectError | None, + metadata: TargetGene, + urn: str, vrs_version: VrsVersion = VrsVersion.V_2, ) -> list[ScoreAnnotationWithLayer]: """Given a list of mappings, add additional contextual data: @@ -367,7 +377,8 @@ def annotate( :param vrs_results: in-progress variant mappings :param tx_select_results: transcript selection if available - :param metadata: MaveDB scoreset metadata + :param metadata: Target gene metadata from MaveDB API + :param urn: Score set URN :return: annotated mappings objects """ score_annotations = [] @@ -386,7 +397,7 @@ def annotate( ): score_annotations.append( _annotate_haplotype_mapping( - mapped_score, tx_results, metadata, vrs_version + mapped_score, tx_results, metadata, urn, vrs_version ) ) elif isinstance(mapped_score.pre_mapped, Allele) and ( @@ -395,7 +406,7 @@ def annotate( ): score_annotations.append( _annotate_allele_mapping( - mapped_score, tx_results, metadata, vrs_version + mapped_score, tx_results, metadata, urn, vrs_version ) ) else: @@ -406,20 +417,40 @@ def annotate( def _get_computed_reference_sequence( - metadata: ScoresetMetadata, + metadata: TargetGene, layer: AnnotationLayer, - tx_output: TxSelectResult | None = None, -) -> ComputedReferenceSequence: + tx_output: TxSelectResult | TxSelectError | None = None, +) -> ComputedReferenceSequence | 
MappedReferenceSequence | None: """Report the computed reference sequence for a score set - :param ss: A score set string + :param metadata: Target gene metadata from MaveDB API :param layer: AnnotationLayer :param tx_output: Transcript data for a score set - :return A ComputedReferenceSequence object + :return A ComputedReferenceSequence object, + or if the target gene is accession-based, a mapped reference sequence describing the pre-mapped reference """ + # accession-based target genes always use accession id as pre-mapped reference sequence + if metadata.target_accession_id: + seq_id = get_vrs_id_from_identifier(metadata.target_accession_id) + # use MappedReferenceSequence type because there should be an accession id but no sequence. + # for accession-based target genes, the object returned by this function describes the provided reference accession + # whereas the object returned by _get_mapped_reference_sequence describes the mapped reference accession, which could be a chromosome for ex. 
+ seq_type: TargetSequenceType + if metadata.target_accession_id.startswith(("NP", "ENSP")): + seq_type = TargetSequenceType.PROTEIN + elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")): + seq_type = TargetSequenceType.DNA + else: + msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" + raise ValueError(msg) + return MappedReferenceSequence( + sequence_type=seq_type, + sequence_id=seq_id, + sequence_accessions=[metadata.target_accession_id], + ) if layer == AnnotationLayer.PROTEIN: - if tx_output is None: - raise ValueError + if tx_output is None or isinstance(tx_output, TxSelectError): + return None seq_id = f"ga4gh:SQ.{sha512t24u(tx_output.sequence.encode('ascii'))}" return ComputedReferenceSequence( sequence=tx_output.sequence, @@ -435,35 +466,45 @@ def _get_computed_reference_sequence( def _get_mapped_reference_sequence( + metadata: TargetGene, layer: AnnotationLayer, - tx_output: TxSelectResult | None = None, + tx_output: TxSelectResult | TxSelectError | None = None, align_result: AlignmentResult | None = None, -) -> MappedReferenceSequence: +) -> MappedReferenceSequence | None: """Report the mapped reference sequence for a score set - :param ss: A score set string + :param metadata: Target gene metadata from MaveDB API :param layer: AnnotationLayer :param tx_output: Transcript data for a score set :return A MappedReferenceSequence object """ - if layer == AnnotationLayer.PROTEIN and tx_output is not None: + if ( + layer == AnnotationLayer.PROTEIN + and tx_output is not None + and isinstance(tx_output, TxSelectResult) + ): if tx_output.np is None: - msg = "No NP accession associated with reference transcript" - raise ValueError(msg) + return None vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: - msg = "ID could not be acquired from Seqrepo for transcript identifier" - raise ValueError(msg) + return None return MappedReferenceSequence( sequence_type=TargetSequenceType.PROTEIN, 
sequence_id=vrs_id, sequence_accessions=[tx_output.np], ) - seq_id = get_chromosome_identifier(align_result.chrom) + # accession-based score sets with genomic accession do not have alignment results + if ( + align_result is None + and metadata.target_accession_id + and metadata.target_accession_id.startswith("NC") + ): + seq_id = metadata.target_accession_id + else: + seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: - msg = "ID could not be acquired from Seqrepo for chromosome identifier" - raise ValueError(msg) + return None return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, @@ -513,64 +554,102 @@ def write_scoreset_mapping_to_json( def save_mapped_output_json( metadata: ScoresetMetadata, - mappings: list[ScoreAnnotationWithLayer], - align_result: AlignmentResult, - tx_output: TxSelectResult | None, + mappings: dict[str, ScoreAnnotationWithLayer], + align_results: dict[str, AlignmentResult], + tx_output: dict[str, TxSelectResult | TxSelectError | None], preferred_layer_only: bool = False, output_path: Path | None = None, ) -> Path: """Save mapping output for a score set in a JSON file :param urn: Score set accession - :param mave_vrs_mappings: A dictionary of VrsObject1_x objects - :param align_result: Alignment information for a score set - :param tx_output: Transcript output for a score set + :param mappings: A dictionary of annotated VRS mappings for each target + :param align_result: A dictionary of alignment information for each target + :param tx_output: A dictionary of transcript output for each target :param output_path: specific location to save output to. 
Default to /urn:mavedb:00000XXX-X-X_mapping_.json :return: output location """ - if preferred_layer_only: - preferred_layers = { - _set_scoreset_layer(metadata.urn, mappings), + # set preferred layers for each target, to allow a mix of coding and noncoding targets + reference_sequences: dict[str, dict] = {} + mapped_scores: list[ScoreAnnotation] = [] + for target_gene in mappings: + if preferred_layer_only: + preferred_layers = { + _set_scoreset_layer(metadata.urn, mappings[target_gene]), + } + else: + preferred_layers = { + mapping.annotation_layer for mapping in mappings[target_gene] + } + + # use target gene name in reference sequence dictionary, rather than the label, which differs between score sets + target_gene_name = metadata.target_genes[target_gene].target_gene_name + + reference_sequences[target_gene_name] = { + layer: { + "computed_reference_sequence": None, + "mapped_reference_sequence": None, + } + for layer in preferred_layers } - else: - preferred_layers = {mapping.annotation_layer for mapping in mappings} - - reference_sequences = { - layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} - for layer in AnnotationLayer - } - - for layer in preferred_layers: - reference_sequences[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence(metadata, layer, tx_output) - reference_sequences[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence(layer, tx_output, align_result) + # sometimes Nonetype layers show up in preferred layers dict; remove these + preferred_layers.discard(None) + for layer in preferred_layers: + reference_sequences[target_gene_name][layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence( + metadata.target_genes[target_gene], layer, tx_output[target_gene] + ) + reference_sequences[target_gene_name][layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence( + metadata.target_genes[target_gene], + layer, + tx_output[target_gene], 
+ align_results[target_gene], + ) - mapped_scores: list[ScoreAnnotation] = [] - for m in mappings: - if m.pre_mapped is None: - mapped_scores.append(ScoreAnnotation(**m.model_dump())) - elif m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) + # if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict + if ( + AnnotationLayer.GENOMIC in reference_sequences[target_gene_name] + and metadata.target_genes[target_gene].target_gene_category + == TargetType.PROTEIN_CODING + and metadata.target_genes[target_gene].target_accession_id is None + and tx_output[target_gene] is not None + and isinstance(tx_output[target_gene], TxSelectResult) + and tx_output[target_gene].nm is not None + ): + reference_sequences[target_gene_name][AnnotationLayer.CDNA] = { + "computed_reference_sequence": None, + "mapped_reference_sequence": { + "sequence_accessions": [tx_output[target_gene].nm] + }, + } + + for m in mappings[target_gene]: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + elif m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + + # drop Nonetype reference sequences + for target_gene in reference_sequences: + for layer in list(reference_sequences[target_gene].keys()): + if ( + reference_sequences[target_gene][layer]["mapped_reference_sequence"] + is None + and reference_sequences[target_gene][layer][ + "computed_reference_sequence" + ] + is None + ) or layer is None: + del reference_sequences[target_gene][layer] output = ScoresetMapping( metadata=metadata.model_dump(), - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ - 
"mapped_reference_sequence" - ], - computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ - "mapped_reference_sequence" - ], + reference_sequences=reference_sequences, mapped_scores=mapped_scores, ) diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 871d885..bf53f48 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -12,10 +12,12 @@ import os from pathlib import Path +import hgvs import polars as pl import requests from biocommons.seqrepo import SeqRepo from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3 +from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider from cool_seq_tool.app import ( LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, @@ -42,11 +44,16 @@ ) from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace from ga4gh.vrs.extras.translator import AlleleTranslator +from ga4gh.vrs.utils.hgvs_tools import HgvsTools from gene.database import create_db from gene.query import QueryHandler from gene.schemas import MatchType, SourceName -from dcd_mapping.schemas import GeneLocation, ManeDescription, ScoresetMetadata +from dcd_mapping.schemas import ( + GeneLocation, + ManeDescription, + TargetGene, +) __all__ = [ "CoolSeqToolBuilder", @@ -66,6 +73,23 @@ ] _logger = logging.getLogger(__name__) +# ---------------------------------- Cdot ---------------------------------- # + + +GENOMIC_FASTA_FILES = [ + "/home/.local/share/dcd_mapping/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", + "/home/.local/share/dcd_mapping/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", +] + + +def seqfetcher() -> ChainedSeqFetcher: + return ChainedSeqFetcher(*[FastaSeqFetcher(file) for file in GENOMIC_FASTA_FILES]) + + +def cdot_rest() -> RESTDataProvider: + return RESTDataProvider(seqfetcher=seqfetcher()) + + # ---------------------------------- Global 
---------------------------------- # @@ -176,6 +200,15 @@ def __new__(cls) -> QueryHandler: return cls.instance +def init_hgvs_tools(self, data_proxy=None): # noqa: ANN202, ANN001 + """Initialize HgvsTools with cdot as data provider""" + self.parser = hgvs.parser.Parser() + self.data_proxy = data_proxy + cdot_provider = cdot_rest() + self.normalizer = hgvs.normalizer.Normalizer(cdot_provider, validate=True) + self.variant_mapper = hgvs.variantmapper.VariantMapper(cdot_provider) + + class TranslatorBuilder: """Singleton constructor for VRS Translator instance.""" @@ -186,6 +219,8 @@ def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator: :return: singleton instance of ``AlleleTranslator`` """ if not hasattr(cls, "instance"): + # monkey patch to use cdot instead of UTA as HgvsTools data provider + HgvsTools.__init__ = init_hgvs_tools tr = AlleleTranslator(data_proxy) cls.instance = tr else: @@ -287,25 +322,25 @@ def _get_hgnc_symbol(term: str) -> str | None: return None -def get_gene_symbol(metadata: ScoresetMetadata) -> str | None: - """Acquire HGNC gene symbol given provided metadata from scoreset. +def get_gene_symbol(target_gene: TargetGene) -> str | None: + """Acquire HGNC gene symbol given provided target gene metadata from MaveDB. Right now, we use two sources for normalizing: 1. UniProt ID, if available 2. 
Target name: specifically, we try the first word in the name (this could cause some problems and we should double-check it) - :param ScoresetMetadata: data given by MaveDB API + :param target_gene: target gene metadata given by MaveDB API :return: gene symbol if available """ - if metadata.target_uniprot_ref: - result = _get_hgnc_symbol(metadata.target_uniprot_ref.id) + if target_gene.target_uniprot_ref: + result = _get_hgnc_symbol(target_gene.target_uniprot_ref.id) if result: return result # try taking the first word in the target name - if metadata.target_gene_name: - parsed_name = metadata.target_gene_name.split(" ")[0] + if target_gene.target_gene_name: + parsed_name = target_gene.target_gene_name.split(" ")[0] return _get_hgnc_symbol(parsed_name) return None @@ -324,21 +359,21 @@ def _normalize_gene(term: str) -> Gene | None: def _get_normalized_gene_response( - metadata: ScoresetMetadata, + target_gene: TargetGene, ) -> Gene | None: """Fetch best normalized concept given available scoreset metadata. :param metadata: salient scoreset metadata items :return: Normalized gene if available """ - if metadata.target_uniprot_ref: - gene_descriptor = _normalize_gene(metadata.target_uniprot_ref.id) + if target_gene.target_uniprot_ref: + gene_descriptor = _normalize_gene(target_gene.target_uniprot_ref.id) if gene_descriptor: return gene_descriptor # try taking the first word in the target name - if metadata.target_gene_name: - parsed_name = metadata.target_gene_name.split(" ")[0] + if target_gene.target_gene_name: + parsed_name = target_gene.target_gene_name.split(" ")[0] gene_descriptor = _normalize_gene(parsed_name) if gene_descriptor: return gene_descriptor @@ -371,7 +406,7 @@ def _get_genomic_interval( return None -def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None: +def get_gene_location(target_gene: TargetGene) -> GeneLocation | None: """Acquire gene location data from gene normalizer using metadata provided by scoreset. 
@@ -380,10 +415,10 @@ def get_gene_location(metadata: ScoresetMetadata) -> GeneLocation | None: 2. Target name: specifically, we try the first word in the name (this could cause some problems and we should double-check it) - :param metadata: data given by MaveDB API + :param target_gene: data given by MaveDB API :return: gene location data if available """ - gene_descriptor = _get_normalized_gene_response(metadata) + gene_descriptor = _get_normalized_gene_response(target_gene) if not gene_descriptor or not gene_descriptor.extensions: return None @@ -426,6 +461,11 @@ def get_chromosome_identifier(chromosome: str) -> str: :return: latest ID if available :raise KeyError: if unable to retrieve identifier """ + # target sequence alignment references are chromosome names like ``"8"``, ``"X"`` + # but accession alignment information from cdot has reference accessions, beginning with "NC_" + # for "NC_" identifiers, just return the identifier + if chromosome.startswith("NC_"): + return chromosome if not chromosome.startswith("chr"): chromosome = f"chr{chromosome}" sr = get_seqrepo() diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 6909ed7..7ccf26c 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -8,7 +8,7 @@ import click from requests import HTTPError -from dcd_mapping.align import AlignmentError, BlatNotFoundError, align +from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result from dcd_mapping.annotate import ( annotate, save_mapped_output_json, @@ -33,7 +33,7 @@ ScoresetMetadata, VrsVersion, ) -from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.transcripts import select_transcripts from dcd_mapping.vrs_map import VrsMapError, vrs_map _logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ async def _check_data_prereqs(silent: bool) -> None: async def map_scoreset( metadata: ScoresetMetadata, - records: list[ScoreRow], + records: dict[str, list[ScoreRow]], 
output_path: Path | None = None, vrs_version: VrsVersion = VrsVersion.V_2, prefer_genomic: bool = False, @@ -156,7 +156,8 @@ async def map_scoreset( _emit_info(f"Performing alignment for {metadata.urn}...", silent) try: - alignment_result = align(metadata, silent) + # dictionary where keys are target gene labels or accession ids, and values are alignment result objects + alignment_results = build_alignment_result(metadata, silent) except BlatNotFoundError as e: msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." _emit_info(msg, silent, logging.ERROR) @@ -175,17 +176,8 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return - _emit_info("Alignment complete.", silent) - - _emit_info("Selecting reference sequence...", silent) - try: - transcript = await select_transcript(metadata, records, alignment_result) - except (TxSelectError, KeyError, ValueError) as e: - _emit_info( - f"Transcript selection failed for scoreset {metadata.urn}", - silent, - logging.ERROR, - ) + except ScoresetNotSupportedError as e: + _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), @@ -193,6 +185,16 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return + _emit_info("Alignment complete.", silent) + + _emit_info("Selecting reference sequence...", silent) + try: + transcripts = await select_transcripts(metadata, records, alignment_results) + # NOTE: transcript selection errors are handled in select_transcripts, + # and they do not cause the entire mapping process to exit; instead, an error will be reported + # on the target level and on the variant level for variants relative to that target + # 
HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate + # underlying issues with data providers. except HTTPError as e: _emit_info( f"HTTP error occurred during transcript selection: {e}", @@ -210,8 +212,16 @@ async def map_scoreset( _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) + vrs_results = {} try: - vrs_results = vrs_map(metadata, alignment_result, records, transcript, silent) + for target_gene in metadata.target_genes: + vrs_results[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=alignment_results[target_gene], + records=records[target_gene], + transcript=transcripts[target_gene], + silent=silent, + ) except VrsMapError as e: _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR @@ -223,7 +233,9 @@ async def map_scoreset( ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) return - if vrs_results is None: + if not vrs_results or all( + mapping_result is None for mapping_result in vrs_results.values() + ): _emit_info(f"No mapping available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -238,20 +250,33 @@ async def map_scoreset( _emit_info("VRS mapping complete.", silent) _emit_info("Annotating metadata and saving to file...", silent) - try: - vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) - except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors - _emit_info( - f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR - ) - final_output = write_scoreset_mapping_to_json( - metadata.urn, - ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), - output_path, - ) - _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - return - if vrs_results is None: + # annotate each target's variants 
separately, since annotation is target specific (e.g. obtaining reference sequence) + annotated_vrs_results = {} + for target_gene in vrs_results: + try: + annotated_vrs_results[target_gene] = annotate( + vrs_results[target_gene], + transcripts[target_gene], + metadata.target_genes[target_gene], + metadata.urn, + vrs_version, + ) + except Exception as e: + _emit_info( + f"VRS annotation failed for scoreset {metadata.urn}", + silent, + logging.ERROR, + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + return + if not annotated_vrs_results or all( + mapping_result is None for mapping_result in annotated_vrs_results.values() + ): _emit_info(f"No annotation available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -266,9 +291,9 @@ async def map_scoreset( try: final_output = save_mapped_output_json( metadata, - vrs_results, - alignment_result, - transcript, + annotated_vrs_results, + alignment_results, + transcripts, prefer_genomic, output_path, ) @@ -306,7 +331,7 @@ async def map_scoreset_urn( """ try: metadata = get_scoreset_metadata(urn, store_path) - records = get_scoreset_records(urn, silent, store_path) + records = get_scoreset_records(metadata, silent, store_path) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 3654eb6..eadd421 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -17,6 +17,7 @@ from fastapi import HTTPException from pydantic import ValidationError +from dcd_mapping.lookup import DataLookupError from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, MAVEDB_BASE_URL, @@ -24,7 +25,13 @@ 
authentication_header, http_download, ) -from dcd_mapping.schemas import ScoreRow, ScoresetMapping, ScoresetMetadata, UniProtRef +from dcd_mapping.schemas import ( + ScoreRow, + ScoresetMapping, + ScoresetMetadata, + TargetGene, + UniProtRef, +) __all__ = [ "get_scoreset_urns", @@ -63,20 +70,24 @@ def get_scoreset_urns() -> set[str]: def _metadata_response_is_human(json_response: dict) -> bool: - """Check that response from scoreset metadata API refers to a human genome target. - + """Check that response from scoreset metadata API refers to a score set containing only human genome targets. :param json_response: response from scoreset metadata API :return: True if contains a target tagged as ``"Homo sapiens"`` """ for target_gene in json_response.get("targetGenes", []): + # for now, assume that genomic coordinate-based score sets are always human, + # since users are not allowed to upload non-human coordinate-based score sets + if target_gene.get("targetAccession"): + continue + organism = ( target_gene.get("targetSequence", {}) .get("taxonomy", {}) .get("organismName") ) - if organism == "Homo sapiens": - return True - return False + if organism != "Homo sapiens": + return False + return True def get_human_urns() -> list[str]: @@ -174,42 +185,63 @@ def get_scoreset_metadata( :raise ResourceAcquisitionError: if unable to acquire metadata """ metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) + target_genes = {} + multi_target = len(metadata["targetGenes"]) > 1 - if len(metadata["targetGenes"]) > 1: - msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." - raise ScoresetNotSupportedError(msg) - gene = metadata["targetGenes"][0] - target_sequence_gene = gene.get("targetSequence") - if target_sequence_gene is None: - msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." 
- raise ScoresetNotSupportedError(msg) - if not _metadata_response_is_human(metadata): - msg = f"Experiment for {scoreset_urn} contains no human targets" - raise ScoresetNotSupportedError(msg) - try: - structured_data = ScoresetMetadata( - urn=metadata["urn"], - target_gene_name=gene["name"], - target_gene_category=gene["category"], - target_sequence=gene["targetSequence"]["sequence"], - target_sequence_type=gene["targetSequence"]["sequenceType"], - target_uniprot_ref=_get_uniprot_ref(metadata), - ) - except (KeyError, ValidationError) as e: - msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" - _logger.error(msg) - raise ScoresetNotSupportedError(msg) from e + for gene in metadata["targetGenes"]: + if not _metadata_response_is_human(metadata): + msg = f"Experiment for {scoreset_urn} contains non-human targets" + raise ScoresetNotSupportedError(msg) + try: + target_gene_sequence = gene.get("targetSequence") + target_gene_accession = gene.get("targetAccession") + + if target_gene_sequence: + target_sequence_label = target_gene_sequence.get("label") + if target_sequence_label is None: + # if there are not multiple targets, label is not required by mavedb, + # so use target gene name as the label. + if not multi_target: + target_sequence_label = gene["name"] + else: + msg = f"No target label provided for target in multi-target score set {scoreset_urn}." 
+ raise DataLookupError(msg) + target_genes[target_sequence_label] = TargetGene( + target_gene_name=gene["name"], + target_gene_category=gene["category"], + target_sequence=target_gene_sequence["sequence"], + target_sequence_type=target_gene_sequence["sequenceType"], + target_sequence_label=target_sequence_label, + target_uniprot_ref=_get_uniprot_ref(metadata), + ) + elif target_gene_accession: + target_accession_id = target_gene_accession["accession"] + target_genes[target_accession_id] = TargetGene( + target_gene_name=gene["name"], + target_gene_category=gene["category"], + target_accession_id=target_accession_id, + target_accession_assembly=target_gene_accession["assembly"], + ) + except (KeyError, ValidationError) as e: + msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" + _logger.error(msg) + raise ScoresetNotSupportedError(msg) from e - return structured_data + return ScoresetMetadata(urn=scoreset_urn, target_genes=target_genes) -def _load_scoreset_records(path: Path) -> list[ScoreRow]: +def _load_scoreset_records( + path: Path, metadata: ScoresetMetadata +) -> dict[str, list[ScoreRow]]: """Load scoreset records from CSV file. + Organize scoreset records by reference sequence prefix / target gene label. + If no reference sequence prefix is provided, the score set should only have one + target, so use the one target's label. This method is intentionally identified as "private", but is refactored out for use during testing. 
""" - scores_data: list[ScoreRow] = [] + scores_data: dict[str, list[ScoreRow]] = {} with path.open() as csvfile: reader = csv.DictReader(csvfile) for row in reader: @@ -217,7 +249,27 @@ def _load_scoreset_records(path: Path) -> list[ScoreRow]: row["score"] = None else: row["score"] = row["score"] - scores_data.append(ScoreRow(**row)) + if row["hgvs_nt"] != "NA": + prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None + elif row["hgvs_pro"] != "NA": + prefix = ( + row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None + ) + else: + msg = f"Each score row in {metadata.urn} must contain hgvs_nt or hgvs_pro variant description " + raise ScoresetNotSupportedError(msg) + # If no reference sequence prefix is provided, the score set should only have one + # target, so use the one target's label. + if prefix is None: + if len(metadata.target_genes) == 1: + prefix = list(metadata.target_genes.keys())[0] # noqa: RUF015 + else: + msg = f"Score set {metadata.urn} contains one or more variant HGVS strings without a reference sequence label. All variant HGVS strings must contain a reference sequence label or accession ID unless the score set contains a single target sequence." + raise ScoresetNotSupportedError(msg) + if prefix in scores_data: + scores_data[prefix].append(ScoreRow(**row)) + else: + scores_data[prefix] = [ScoreRow(**row)] return scores_data @@ -238,8 +290,8 @@ def _get_experiment_53_scores(outfile: Path, silent: bool) -> None: def get_scoreset_records( - urn: str, silent: bool = True, dcd_mapping_dir: Path | None = None -) -> list[ScoreRow]: + metadata: ScoresetMetadata, silent: bool = True, dcd_mapping_dir: Path | None = None +) -> dict[str, list[ScoreRow]]: """Get scoreset records. Only hit the MaveDB API if unavailable locally. 
That means data must be refreshed @@ -255,13 +307,13 @@ def get_scoreset_records( """ if not dcd_mapping_dir: dcd_mapping_dir = LOCAL_STORE_PATH - scores_csv = dcd_mapping_dir / f"{urn}_scores.csv" + scores_csv = dcd_mapping_dir / f"{metadata.urn}_scores.csv" # TODO use smarter/more flexible caching methods if not scores_csv.exists(): - if urn == "urn:mavedb:00000053-a-1": + if metadata.urn == "urn:mavedb:00000053-a-1": _get_experiment_53_scores(scores_csv, silent) else: - url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}/scores" + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{metadata.urn}/scores" try: http_download(url, scores_csv, silent) except requests.HTTPError as e: @@ -269,7 +321,7 @@ def get_scoreset_records( _logger.error(msg) raise ResourceAcquisitionError(msg) from e - return _load_scoreset_records(scores_csv) + return _load_scoreset_records(scores_csv, metadata) def with_mavedb_score_set(fn: Callable) -> Callable: @@ -285,8 +337,8 @@ async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002 # without the need to download the data again. 
temp_dir_as_path = Path(temp_dir) try: - get_scoreset_metadata(urn, temp_dir_as_path) - get_scoreset_records(urn, silent, temp_dir_as_path) + metadata = get_scoreset_metadata(urn, temp_dir_as_path) + get_scoreset_records(metadata, silent, temp_dir_as_path) except ScoresetNotSupportedError as e: return ScoresetMapping( metadata=None, diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 366943c..072e3b8 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -40,15 +40,24 @@ class UniProtRef(BaseModel): offset: int -class ScoresetMetadata(BaseModel): - """Store all relevant metadata from metadata reported for scoreset by MaveDB""" +class TargetGene(BaseModel): + """Store metadata for a target gene from a MaveDB score set""" - urn: str target_gene_name: str target_gene_category: TargetType - target_sequence: str - target_sequence_type: TargetSequenceType + target_sequence: str | None = None + target_sequence_type: TargetSequenceType | None = None + target_sequence_label: str | None = None target_uniprot_ref: UniProtRef | None = None + target_accession_id: str | None = None + target_accession_assembly: str | None = None + + +class ScoresetMetadata(BaseModel): + """Store all relevant metadata from metadata reported for scoreset by MaveDB""" + + urn: str + target_genes: dict[str, TargetGene] class ScoreRow(BaseModel): @@ -99,8 +108,8 @@ class AlignmentResult(BaseModel): chrom: str strand: Strand - coverage: float - ident_pct: float + coverage: float | None = None + ident_pct: float | None = None query_range: SequenceRange query_subranges: list[SequenceRange] hit_range: SequenceRange @@ -196,9 +205,14 @@ class ScoresetMapping(BaseModel): mapped_date_utc: str = Field( default=datetime.datetime.now(tz=datetime.UTC).isoformat() ) - computed_protein_reference_sequence: ComputedReferenceSequence | None = None - mapped_protein_reference_sequence: MappedReferenceSequence | None = None - computed_genomic_reference_sequence: 
ComputedReferenceSequence | None = None - mapped_genomic_reference_sequence: MappedReferenceSequence | None = None + reference_sequences: dict[ + str, + dict[ + AnnotationLayer, + dict[ + str, ComputedReferenceSequence | MappedReferenceSequence | dict | None + ], + ], + ] | None = None mapped_scores: list[ScoreAnnotation] | None = None error_message: str | None = None diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 56bea85..9034894 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -22,6 +22,7 @@ ManeDescription, ScoreRow, ScoresetMetadata, + TargetGene, TargetSequenceType, TargetType, TranscriptDescription, @@ -38,7 +39,7 @@ class TxSelectError(Exception): async def _get_compatible_transcripts( - metadata: ScoresetMetadata, align_result: AlignmentResult + target_gene: TargetGene, align_result: AlignmentResult ) -> list[list[str]]: """Acquire matching transcripts @@ -51,7 +52,7 @@ async def _get_compatible_transcripts( else: aligned_chrom = align_result.chrom chromosome = get_chromosome_identifier(aligned_chrom) - gene_symbol = get_gene_symbol(metadata) + gene_symbol = get_gene_symbol(target_gene) if not gene_symbol: raise TxSelectError transcript_matches = [] @@ -145,7 +146,7 @@ def _get_protein_sequence(target_sequence: str) -> str: async def _select_protein_reference( - metadata: ScoresetMetadata, align_result: AlignmentResult + target_gene: TargetGene, align_result: AlignmentResult ) -> TxSelectResult: """Select preferred transcript for protein reference sequence @@ -155,23 +156,21 @@ async def _select_protein_reference( :raise TxSelectError: if no matching MANE transcripts and unable to get UniProt ID/ reference sequence """ - matching_transcripts = await _get_compatible_transcripts(metadata, align_result) + matching_transcripts = await _get_compatible_transcripts(target_gene, align_result) if not matching_transcripts: common_transcripts = None else: common_transcripts = 
_reduce_compatible_transcripts(matching_transcripts) if not common_transcripts: - if not metadata.target_uniprot_ref: - msg = f"Unable to find matching transcripts for {metadata.urn}" + if not target_gene.target_uniprot_ref: + msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}" raise TxSelectError(msg) - protein_sequence = get_uniprot_sequence(metadata.target_uniprot_ref.id) - np_accession = metadata.target_uniprot_ref.id - ref_sequence = get_uniprot_sequence(metadata.target_uniprot_ref.id) + protein_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id) + np_accession = target_gene.target_uniprot_ref.id + ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id) if not ref_sequence: - msg = ( - f"Unable to grab reference sequence from uniprot.org for {metadata.urn}" - ) - raise ValueError(msg) + msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}" + raise TxSelectError(msg) nm_accession = None tx_mode = None else: @@ -186,8 +185,7 @@ async def _select_protein_reference( np_accession = best_tx.refseq_prot tx_mode = best_tx.transcript_priority - protein_sequence = _get_protein_sequence(metadata.target_sequence) - # TODO -- look at these two lines + protein_sequence = _get_protein_sequence(target_gene.target_sequence) is_full_match = ref_sequence.find(protein_sequence) != -1 start = ref_sequence.find(protein_sequence[:10]) @@ -201,10 +199,10 @@ async def _select_protein_reference( ) -def _offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) -> int: +def _offset_target_sequence(target_gene: TargetGene, records: list[ScoreRow]) -> int: """Find start location in target sequence - :param metadata: MaveDB metadata for score set + :param target_gene: MaveDB metadata for target gene :param records: individual score records (including MAVE-HGVS descriptions) :return: starting index position (may be 0) """ @@ -232,7 +230,7 @@ def 
_offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) amino_acids_by_position[loc] = seq1(aa) err_locs = [] - protein_sequence = Seq(metadata.target_sequence).translate(table="1") + protein_sequence = Seq(target_gene.target_sequence).translate(table="1") for i in range(len(protein_sequence)): if ( str(i) in amino_acids_by_position @@ -251,7 +249,7 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: list[ScoreRow]) for value in amino_acids_by_position.values(): seq += value - protein_sequence = _get_protein_sequence(metadata.target_sequence) + protein_sequence = _get_protein_sequence(target_gene.target_sequence) offset = 0 if protein_sequence in seq: @@ -308,22 +306,24 @@ def _handle_edge_cases( async def select_transcript( - metadata: ScoresetMetadata, + scoreset_urn: str, + target_gene: TargetGene, records: list[ScoreRow], align_result: AlignmentResult, ) -> TxSelectResult | None: - """Select appropriate human reference sequence for scoreset. + """Select appropriate human reference sequence for one target in a score set. - * Unnecessary for regulatory/other noncoding scoresets which report genomic + * Unnecessary for regulatory/other noncoding targets in score sets which report genomic variations. - * For protein scoresets, identify a matching RefSeq protein reference sequence. + * For protein-coding targets, identify a matching RefSeq protein reference sequence. 
- :param metadata: Scoreset metadata from MaveDB + :param scoreset_urn: MaveDB URN for score set, used for hardcoding for certain score sets + :param target_gene: Target gene metadata from MaveDB :param records: :param align_result: alignment results :return: Transcript description (accession ID, offset, selected sequence, etc) """ - if metadata.urn.startswith("urn:mavedb:00000097"): + if scoreset_urn.startswith("urn:mavedb:00000097"): _logger.info( "Score sets in urn:mavedb:00000097 are already expressed in full HGVS strings -- using predefined results because additional hard-coding is unnecessary" ) @@ -333,16 +333,67 @@ async def select_transcript( start=0, is_full_match=False, transcript_mode=TranscriptPriority.MANE_SELECT, - sequence=_get_protein_sequence(metadata.target_sequence), + sequence=_get_protein_sequence(target_gene.target_sequence), ) - if metadata.target_gene_category != TargetType.PROTEIN_CODING: - _logger.debug("%s is regulatory/noncoding -- skipping transcript selection") + if target_gene.target_gene_category != TargetType.PROTEIN_CODING: + _logger.debug( + "%s is regulatory/noncoding -- skipping transcript selection", + target_gene.target_gene_name, + ) return None - transcript_reference = await _select_protein_reference(metadata, align_result) - if transcript_reference and metadata.target_sequence_type == TargetSequenceType.DNA: - offset = _offset_target_sequence(metadata, records) + transcript_reference = await _select_protein_reference(target_gene, align_result) + if ( + transcript_reference + and target_gene.target_sequence_type == TargetSequenceType.DNA + ): + offset = _offset_target_sequence(target_gene, records) if offset: transcript_reference.start = offset - return _handle_edge_cases(metadata.urn, transcript_reference) + return _handle_edge_cases(scoreset_urn, transcript_reference) + + +async def select_transcripts( + scoreset_metadata: ScoresetMetadata, + records: dict[str, list[ScoreRow]], + align_results: dict[str, AlignmentResult | 
None], +) -> dict[str, TxSelectResult | Exception | None]: + """Select appropriate human reference sequence for each target in a score set. + :param scoreset_metadata: Metadata for score set from MaveDB API + :param records: Variant/score records from MaveDB API + :param align_results: Alignment results for all targets in score set. + * Dict where keys are target labels and values are alignment result objects + :return: dict where keys are target labels and values are objects describing selected transcript (accession ID, offset, selected sequence, etc) + """ + selected_transcripts = {} + for target_gene in scoreset_metadata.target_genes: + if scoreset_metadata.target_genes[target_gene].target_accession_id: + # for accession-based targets, create tx select objects for protein sequence accessions only + accession_id = scoreset_metadata.target_genes[ + target_gene + ].target_accession_id + # TODO create full list of possible protein accession prefixes + if accession_id.startswith(("NP_", "ENSP_")): + # TODO make sequence field optional instead of leaving blank here? 
+ selected_transcripts[target_gene] = TxSelectResult( + np=accession_id, + start=0, + is_full_match=True, + sequence="", + transcript_mode=None, + ) + else: + selected_transcripts[target_gene] = None + else: + try: + selected_transcripts[target_gene] = await select_transcript( + scoreset_urn=scoreset_metadata.urn, + target_gene=scoreset_metadata.target_genes[target_gene], + records=records[target_gene], + align_result=align_results[target_gene], + ) + except (TxSelectError, KeyError) as e: + selected_transcripts[target_gene] = e + + return selected_transcripts diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py index e095212..8c31b0b 100644 --- a/src/dcd_mapping/version.py +++ b/src/dcd_mapping/version.py @@ -1,3 +1,3 @@ """Provide dcd mapping version""" -dcd_mapping_version = "2024.1.3" +dcd_mapping_version = "2025.1.0" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 4f4a32f..b03742c 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -5,6 +5,7 @@ from itertools import cycle from Bio.Seq import Seq +from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand from ga4gh.core import ga4gh_identify, sha512t24u from ga4gh.vrs._internal.models import ( @@ -20,6 +21,7 @@ from mavehgvs.variant import Variant from dcd_mapping.lookup import ( + cdot_rest, get_chromosome_identifier, get_seqrepo, translate_hgvs_to_vrs, @@ -28,11 +30,12 @@ AlignmentResult, MappedScore, ScoreRow, - ScoresetMetadata, + TargetGene, TargetSequenceType, TargetType, TxSelectResult, ) +from dcd_mapping.transcripts import TxSelectError __all__ = ["vrs_map", "VrsMapError"] @@ -67,11 +70,26 @@ def _process_any_aa_code(hgvs_pro_string: str) -> str: return hgvs_pro_string +def is_intronic_variant(variant: Variant) -> bool: + """Return True if given Variant is intronic, otherwise return False. + Supports single or multi-position variants. 
+ """ + if isinstance(variant.positions, Iterable): + if any(position.is_intronic() for position in variant.positions): + return True + else: + if variant.positions.is_intronic(): + return True + + return False + + def _create_pre_mapped_hgvs_strings( raw_description: str, layer: AnnotationLayer, tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, + accession_id: str | None = None, ) -> list[str]: """Generate a list of (pre-mapped) HGVS strings from one long string containing many valid HGVS substrings @@ -84,13 +102,14 @@ def _create_pre_mapped_hgvs_strings( :param layer: An enum denoting the targeted annotation layer of these HGVS strings :param tx: A TxSelectResult object defining the transcript we are mapping to (or None). :param alignment: An AlignmentResult object defining the alignment we are mapping to (or None). + :param accession_id: An accession id describing the reference sequence (for accession-based target gene variants) :return: A list of HGVS strings prior to being mapped to the `tx` or `alignment` """ if layer is AnnotationLayer.PROTEIN and tx is None: msg = f"Transcript result must be provided for {layer} annotations (Transcript was `{tx}`)." raise ValueError(msg) - if layer is AnnotationLayer.GENOMIC and alignment is None: - msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)." + if layer is AnnotationLayer.GENOMIC and alignment is None and accession_id is None: + msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}`)." 
raise ValueError(msg) raw_variant_strings = _parse_raw_variant_str(raw_description) @@ -102,10 +121,18 @@ def _create_pre_mapped_hgvs_strings( msg = f"Variant could not be parsed by mavehgvs: {error}" raise ValueError(msg) + # ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called + # therefore skip them there + if is_intronic_variant(variant): + msg = f"Variant is intronic and cannot be processed: {variant}" + raise ValueError(msg) + + if accession_id: + hgvs_strings.append(accession_id + ":" + str(variant)) # Ideally we would create an HGVS string namespaced to GA4GH. The line below # creates such a string, but it is not able to be parsed by the GA4GH VRS translator. # hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant)) - if layer is AnnotationLayer.PROTEIN: + elif layer is AnnotationLayer.PROTEIN: assert tx # noqa: S101. mypy help hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: @@ -156,6 +183,12 @@ def _create_post_mapped_hgvs_strings( msg = f"Variant could not be parsed by mavehgvs: {error}" raise ValueError(msg) + # ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called + # therefore skip them there + if is_intronic_variant(variant): + msg = f"Variant is intronic and cannot be processed: {variant}" + raise ValueError(msg) + if layer is AnnotationLayer.PROTEIN: assert tx # noqa: S101. mypy help @@ -280,6 +313,12 @@ def _parse_raw_variant_str(raw_description: str) -> list[str]: :param raw_description: A string that may contain a list of variant descriptions or a single variant description :return: A list of HGVS strings """ + # some variant strings follow mavehgvs format, meaning they don't have a reference sequence id and colon preceding the c./g./n./p. 
prefix + # the reference sequence information has previously been parsed for score sets with multiple targets, + # so can discard the reference sequence id and colon if they are present + # TODO check assumption of no colon unless reference sequence identifier is supplied! + if ":" in raw_description: + raw_description = raw_description.split(":")[1] if "[" in raw_description: prefix = raw_description[0:2] return [prefix + var for var in set(raw_description[3:-1].split(";"))] @@ -397,7 +436,7 @@ def _map_protein_coding_pro( def _map_genomic( row: ScoreRow, sequence_id: str, - align_result: AlignmentResult, + align_result: AlignmentResult | None, ) -> MappedScore: """Construct VRS object mapping for ``hgvs_nt`` variant column entry @@ -408,6 +447,15 @@ def _map_genomic( :param align_result: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ + namespace = infer_namespace(sequence_id) + if namespace is None: + if sequence_id.startswith("SQ."): + # if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace + namespace = "ga4gh" + else: + msg = f"Namespace could not be inferred from sequence: {sequence_id}" + raise ValueError(msg) + if ( row.hgvs_nt in {"_wt", "_sy", "="} or "fs" @@ -423,55 +471,146 @@ def _map_genomic( error_message=f"Can't process variant syntax {row.hgvs_nt}", ) - try: - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - pre_mapped_genomic = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - sequence_id, - True, - ) - except Exception as e: - _logger.warning( - "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", - row.hgvs_nt, - row.accession, - str(e), - ) - return MappedScore( - accession_id=row.accession, score=row.score, error_message=str(e) - ) + if align_result is None: + # for contig accession based score sets, no 
mapping is performed, + # so pre- and post-mapped alleles are the same + try: + pre_mapped_hgvs_strings = ( + post_mapped_hgvs_strings + ) = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + accession_id=sequence_id, + ) + # accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) - try: - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - post_mapped_genomic = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - None, - False, - ) - except Exception as e: - _logger.warning( - "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", - row.hgvs_nt, - row.accession, - str(e), - ) - return MappedScore( - accession_id=row.accession, - score=row.score, - annotation_layer=AnnotationLayer.GENOMIC, - pre_mapped=pre_mapped_genomic, - error_message=str(e), - ) + elif namespace.lower() in ("refseq", "ncbi", "ensembl"): + # nm/enst way + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + accession_id=sequence_id, + ) + # accession-based pre-mapped alleles should be constructed like post-mapped alleles (sequence id is gathered from hgvs string rather than manually provided) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + 
AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + error_message=str(e), + ) + elif namespace.lower() == "ga4gh": + # target seq way + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + sequence_id, + True, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) + ) + + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + str(e), + ) + 
return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + error_message=str(e), + ) + else: + msg = f"Reference sequence namespace not supported: {namespace}" + raise ValueError(msg) return MappedScore( accession_id=row.accession, @@ -526,17 +665,30 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: ) +def _hgvs_pro_is_valid(hgvs_pro: str) -> bool: + """Check for invalid or unavailable protein MAVE-HGVS variation + + :param hgvs_nt: MAVE_HGVS protein expression + :return: True if expression appears populated and valid + """ + return ( + (hgvs_pro not in {"_wt", "_sy", "NA"}) + and (len(hgvs_pro) != 3) + and ("fs" not in hgvs_pro) + ) + + def _map_protein_coding( - metadata: ScoresetMetadata, + metadata: TargetGene, records: list[ScoreRow], - transcript: TxSelectResult, + transcript: TxSelectResult | TxSelectError, align_result: AlignmentResult, ) -> list[MappedScore]: """Perform mapping on protein coding experiment results - :param metadata: The metadata for a score set + :param metadata: Target gene metadata from MaveDB API :param records: The list of MAVE variants in a given score set - :param transcript: The transcript data for a score set + :param transcript: The transcript data for a score set, or an error message describing why an expected transcript is missing :param align_results: The alignment data for a score set :return: A list of mappings """ @@ -552,27 +704,50 @@ def _map_protein_coding( variations: list[MappedScore] = [] for row in records: - hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) - if hgvs_pro_mappings: - variations.append(hgvs_pro_mappings) - + hgvs_nt_mappings = None + hgvs_pro_mappings = None if _hgvs_nt_is_valid(row.hgvs_nt): hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result) - if hgvs_nt_mappings: - variations.append(hgvs_nt_mappings) - + if ( + isinstance(transcript, TxSelectError) and not hgvs_nt_mappings + 
): # only create error message if there is not an hgvs nt mapping + # TODO create pre mapped allele + hgvs_pro_mappings = MappedScore( + accession_id=row.accession, + score=row.score, + error_message=str(transcript).strip("'"), + ) + else: + if _hgvs_pro_is_valid(row.hgvs_pro): + hgvs_pro_mappings = _map_protein_coding_pro( + row, psequence_id, transcript + ) + elif ( + not hgvs_nt_mappings + ): # only create error message if there is not an hgvs nt mapping + hgvs_pro_mappings = MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Invalid protein variant syntax", + ) + + # append both pro and nt mappings if both available + if hgvs_pro_mappings: + variations.append(hgvs_pro_mappings) + if hgvs_nt_mappings: + variations.append(hgvs_nt_mappings) return variations def _map_regulatory_noncoding( - metadata: ScoresetMetadata, + metadata: TargetGene, records: list[ScoreRow], align_result: AlignmentResult, ) -> list[MappedScore]: """Perform mapping on noncoding/regulatory experiment results - :param metadata: metadata for URN + :param metadata: Target gene metadata from MaveDB API :param records: list of MAVE experiment result rows :param align_result: An AlignmentResult object for a score set :return: A list of VRS mappings @@ -587,6 +762,52 @@ def _map_regulatory_noncoding( return variations +def store_accession( + accession_id: str, +) -> None: + namespace = infer_namespace(accession_id) + alias_dict_list = [{"namespace": namespace, "alias": accession_id}] + cd = cdot_rest() + sequence = cd.get_seq(accession_id) + sr = get_seqrepo() + sr.sr.store(sequence, alias_dict_list) + + +def _map_accession( + metadata: TargetGene, + records: list[ScoreRow], + align_result: AlignmentResult + | None, # NP and NC accessions won't have alignment results + transcript: TxSelectResult | None, +) -> list[MappedScore]: + variations: list[MappedScore] = [] + sequence_id = metadata.target_accession_id + if sequence_id is None: + raise ValueError + + 
store_accession(sequence_id) + + # TODO full list of protein accession id prefixes + if metadata.target_accession_id.startswith(("NP", "ENSP")): + for row in records: + hgvs_pro_mappings = _map_protein_coding_pro( + row, + sequence_id, + transcript, + ) + variations.append(hgvs_pro_mappings) + # TODO full list of transcript and contig accession id prefixes + elif metadata.target_accession_id.startswith(("NM", "ENST", "NC")): + for row in records: + hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) + variations.append(hgvs_nt_mappings) + else: + msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}" + raise ValueError(msg) + + return variations + + def _rle_to_lse( rle: ReferenceLengthExpression, location: SequenceLocation ) -> LiteralSequenceExpression: @@ -661,22 +882,24 @@ def _construct_vrs_allele( def vrs_map( - metadata: ScoresetMetadata, - align_result: AlignmentResult, + metadata: TargetGene, + align_result: AlignmentResult | None, records: list[ScoreRow], - transcript: TxSelectResult | None = None, + transcript: TxSelectResult | TxSelectError | None = None, silent: bool = True, ) -> list[MappedScore] | None: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects. 
- :param metadata: salient MAVE scoreset metadata + :param metadata: target gene metadata from MaveDB API :param align_result: output from the sequence alignment process :param records: scoreset records :param transcript: output of transcript selection process :param silent: If true, suppress console output :return: A list of mapping results """ + if metadata.target_accession_id: + return _map_accession(metadata, records, align_result, transcript) if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: return _map_protein_coding( metadata, diff --git a/tests/conftest.py b/tests/conftest.py index a9ce550..e18a0e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,8 +18,14 @@ from unittest.mock import MagicMock import pytest +from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher -from dcd_mapping.schemas import AlignmentResult, ScoresetMetadata, TxSelectResult +from dcd_mapping.schemas import ( + AlignmentResult, + ScoresetMetadata, + TargetGene, + TxSelectResult, +) FIXTURE_DATA_DIR = Path(__file__).parents[0].resolve() / "fixtures" @@ -43,7 +49,11 @@ def scoreset_metadata_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for d in data["scoreset_metadata"]: - formatted_data = ScoresetMetadata(**d) + target_genes = {} + for target_gene in d["target_genes"]: + target_gene_metadata = d["target_genes"][target_gene] + target_genes[target_gene] = TargetGene(**target_gene_metadata) + formatted_data = ScoresetMetadata(urn=d["urn"], target_genes=target_genes) results[formatted_data.urn] = formatted_data return results @@ -56,8 +66,10 @@ def align_result_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for urn, result in data.items(): - formatted_result = AlignmentResult(**result) - results[urn] = formatted_result + formatted_results = {} + for target_gene in result: + formatted_results[target_gene] = AlignmentResult(**result[target_gene]) + results[urn] = formatted_results return results @@ 
-69,8 +81,10 @@ def transcript_results_fixture(fixture_data_dir: Path): data = json.load(f) results = {} for urn, result in data.items(): - formatted_result = TxSelectResult(**result) - results[urn] = formatted_result + formatted_results = {} + for target_gene in result: + formatted_results[target_gene] = TxSelectResult(**result[target_gene]) + results[urn] = formatted_results return results @@ -203,3 +217,18 @@ def _translate_identifier( mocker.patch("dcd_mapping.vrs_map.get_seqrepo", return_value=mock_seqrepo_access) mocker.patch("dcd_mapping.lookup.get_seqrepo", return_value=mock_seqrepo_access) return mock_seqrepo_access + + +@pytest.fixture() +def data_provider(fixture_data_dir: Path): + """Provide a ChainedSeqFetcher with mocked fasta files for testing. + Currently, no tests actually use cdot, but a FileNotFoundError would be raised + without this fixture when cdot is imported into dcd_mapping.vrs_map. + """ + # NOTE: This fasta file would not work for test cases that actually use cdot fetching, + # it is only meant to avoid a FileNotFoundError. 
+ test_fasta_file = fixture_data_dir / "test.fasta" + + seqfetcher = ChainedSeqFetcher(FastaSeqFetcher(test_fasta_file)) + + yield seqfetcher # noqa: PT022 diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json index 306d755..6b2d086 100644 --- a/tests/fixtures/align_result.json +++ b/tests/fixtures/align_result.json @@ -1,626 +1,614 @@ { "urn:mavedb:00000002-a-2": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 77.45098039215686, - "query_range": { - "start": 0, - "end": 102 - }, - "query_subranges": [ - { + "hYAP65 WW domain": { + "chrom": "chr11", + "strand": 1, + "coverage": 100.0, + "ident_pct": 77.45098039215686, + "query_range": { "start": 0, - "end": 60 - }, - { - "start": 60, "end": 102 - } - ], - "hit_range": { - "start": 102114329, - "end": 102162492 - }, - "hit_subranges": [ - { - "start": 102114329, - "end": 102114389 }, - { - "start": 102162450, - "end": 102162492 - } - ] - }, - "urn:mavedb:00000007-a-2": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 77.45098039215686, - "query_range": { - "start": 0, - "end": 102 - }, - "query_subranges": [ - { - "start": 0, - "end": 60 - }, - { - "start": 60, - "end": 102 - } - ], - "hit_range": { - "start": 102114329, - "end": 102162492 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 0, + "end": 60 + }, + { + "start": 60, + "end": 102 + } + ], + "hit_range": { "start": 102114329, - "end": 102114389 - }, - { - "start": 102162450, "end": 102162492 - } - ] + }, + "hit_subranges": [ + { + "start": 102114329, + "end": 102114389 + }, + { + "start": 102162450, + "end": 102162492 + } + ] + } }, "urn:mavedb:00000099-a-1": { - "chrom": "chr3", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 1047 - }, - "query_subranges": [ - { + "RHO": { + "chrom": "chr3", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 361 - }, - { - "start": 
361, - "end": 530 - }, - { - "start": 530, - "end": 696 - }, - { - "start": 696, - "end": 936 - }, - { - "start": 936, "end": 1047 - } - ], - "hit_range": { - "start": 129528733, - "end": 129533718 - }, - "hit_subranges": [ - { - "start": 129528733, - "end": 129529094 - }, - { - "start": 129530875, - "end": 129531044 - }, - { - "start": 129532250, - "end": 129532416 - }, - { - "start": 129532532, - "end": 129532772 }, - { - "start": 129533607, + "query_subranges": [ + { + "start": 0, + "end": 361 + }, + { + "start": 361, + "end": 530 + }, + { + "start": 530, + "end": 696 + }, + { + "start": 696, + "end": 936 + }, + { + "start": 936, + "end": 1047 + } + ], + "hit_range": { + "start": 129528733, "end": 129533718 - } - ] + }, + "hit_subranges": [ + { + "start": 129528733, + "end": 129529094 + }, + { + "start": 129530875, + "end": 129531044 + }, + { + "start": 129532250, + "end": 129532416 + }, + { + "start": 129532532, + "end": 129532772 + }, + { + "start": 129533607, + "end": 129533718 + } + ] + } }, "urn:mavedb:00000103-c-1": { - "chrom": "chr22", - "strand": -1, - "coverage": 100.0, - "ident_pct": 99.9071494893222, - "query_range": { - "start": 0, - "end": 360 - }, - "query_subranges": [ - { + "MAPK1": { + "chrom": "chr22", + "strand": -1, + "coverage": 100.0, + "ident_pct": 99.9071494893222, + "query_range": { "start": 0, - "end": 39 - }, - { - "start": 40, - "end": 101 - }, - { - "start": 101, - "end": 164 - }, - { - "start": 164, - "end": 203 - }, - { - "start": 203, - "end": 241 - }, - { - "start": 241, - "end": 285 - }, - { - "start": 285, - "end": 322 - }, - { - "start": 322, "end": 360 - } - ], - "hit_range": { - "start": 21769206, - "end": 21867440 - }, - "hit_subranges": [ - { - "start": 21867323, - "end": 21867440 - }, - { - "start": 21807662, - "end": 21807845 - }, - { - "start": 21805849, - "end": 21806038 - }, - { - "start": 21799011, - "end": 21799128 - }, - { - "start": 21788694, - "end": 21788808 - }, - { - "start": 21788257, - "end": 21788389 - }, 
- { - "start": 21772872, - "end": 21772983 }, - { + "query_subranges": [ + { + "start": 0, + "end": 39 + }, + { + "start": 40, + "end": 101 + }, + { + "start": 101, + "end": 164 + }, + { + "start": 164, + "end": 203 + }, + { + "start": 203, + "end": 241 + }, + { + "start": 241, + "end": 285 + }, + { + "start": 285, + "end": 322 + }, + { + "start": 322, + "end": 360 + } + ], + "hit_range": { "start": 21769206, - "end": 21769320 - } - ] + "end": 21867440 + }, + "hit_subranges": [ + { + "start": 21867323, + "end": 21867440 + }, + { + "start": 21807662, + "end": 21807845 + }, + { + "start": 21805849, + "end": 21806038 + }, + { + "start": 21799011, + "end": 21799128 + }, + { + "start": 21788694, + "end": 21788808 + }, + { + "start": 21788257, + "end": 21788389 + }, + { + "start": 21772872, + "end": 21772983 + }, + { + "start": 21769206, + "end": 21769320 + } + ] + } }, "urn:mavedb:00000041-a-1": { - "chrom": "chr20", - "strand": 1, - "coverage": 100.0, - "ident_pct": 99.86666666666666, - "query_range": { - "start": 0, - "end": 750 - }, - "query_subranges": [ - { + "Src catalytic domain": { + "chrom": "chr20", + "strand": 1, + "coverage": 100.0, + "ident_pct": 99.86666666666666, + "query_range": { "start": 0, - "end": 52 - }, - { - "start": 52, - "end": 232 - }, - { - "start": 232, - "end": 309 - }, - { - "start": 309, - "end": 463 - }, - { - "start": 463, - "end": 595 - }, - { - "start": 595, "end": 750 - } - ], - "hit_range": { - "start": 37397802, - "end": 37403325 - }, - "hit_subranges": [ - { - "start": 37397802, - "end": 37397854 - }, - { - "start": 37400114, - "end": 37400294 - }, - { - "start": 37401601, - "end": 37401678 }, - { - "start": 37402434, - "end": 37402588 - }, - { - "start": 37402748, - "end": 37402880 - }, - { - "start": 37403170, + "query_subranges": [ + { + "start": 0, + "end": 52 + }, + { + "start": 52, + "end": 232 + }, + { + "start": 232, + "end": 309 + }, + { + "start": 309, + "end": 463 + }, + { + "start": 463, + "end": 595 + }, + { + "start": 
595, + "end": 750 + } + ], + "hit_range": { + "start": 37397802, "end": 37403325 - } - ] + }, + "hit_subranges": [ + { + "start": 37397802, + "end": 37397854 + }, + { + "start": 37400114, + "end": 37400294 + }, + { + "start": 37401601, + "end": 37401678 + }, + { + "start": 37402434, + "end": 37402588 + }, + { + "start": 37402748, + "end": 37402880 + }, + { + "start": 37403170, + "end": 37403325 + } + ] + } }, "urn:mavedb:00000018-a-1": { - "chrom": "chr11", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 187 - }, - "query_subranges": [ - { + "HBB promoter": { + "chrom": "chr11", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 187 - } - ], - "hit_range": { - "start": 5227021, - "end": 5227208 - }, - "hit_subranges": [ - { + }, + "query_subranges": [ + { + "start": 0, + "end": 187 + } + ], + "hit_range": { "start": 5227021, "end": 5227208 - } - ] + }, + "hit_subranges": [ + { + "start": 5227021, + "end": 5227208 + } + ] + } }, "urn:mavedb:00000001-a-4": { - "chrom": "chr16", - "strand": 1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 477 - }, - "query_subranges": [ - { + "UBE2I": { + "chrom": "chr16", + "strand": 1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 66 - }, - { - "start": 66, - "end": 150 - }, - { - "start": 150, - "end": 223 - }, - { - "start": 223, - "end": 333 - }, - { - "start": 333, - "end": 413 - }, - { - "start": 413, "end": 477 - } - ], - "hit_range": { - "start": 1314030, - "end": 1324793 - }, - "hit_subranges": [ - { - "start": 1314030, - "end": 1314096 - }, - { - "start": 1314292, - "end": 1314376 - }, - { - "start": 1315653, - "end": 1315726 - }, - { - "start": 1320173, - "end": 1320283 - }, - { - "start": 1320437, - "end": 1320517 }, - { - "start": 1324729, + "query_subranges": [ + { + "start": 0, + "end": 66 + }, + { + "start": 66, + "end": 150 + }, + { + "start": 
150, + "end": 223 + }, + { + "start": 223, + "end": 333 + }, + { + "start": 333, + "end": 413 + }, + { + "start": 413, + "end": 477 + } + ], + "hit_range": { + "start": 1314030, "end": 1324793 - } - ] + }, + "hit_subranges": [ + { + "start": 1314030, + "end": 1314096 + }, + { + "start": 1314292, + "end": 1314376 + }, + { + "start": 1315653, + "end": 1315726 + }, + { + "start": 1320173, + "end": 1320283 + }, + { + "start": 1320437, + "end": 1320517 + }, + { + "start": 1324729, + "end": 1324793 + } + ] + } }, "urn:mavedb:00000098-a-1": { - "chrom": "chr3", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 12 - }, - "query_subranges": [ - { + "SCN5A": { + "chrom": "chr3", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, "end": 12 - } - ], - "hit_range": { - "start": 38551475, - "end": 38551511 - }, - "hit_subranges": [ - { + }, + "query_subranges": [ + { + "start": 0, + "end": 12 + } + ], + "hit_range": { "start": 38551475, "end": 38551511 - } - ] + }, + "hit_subranges": [ + { + "start": 38551475, + "end": 38551511 + } + ] + } }, "urn:mavedb:00000061-h-1": { - "chrom": "chr3", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 117 - }, - "query_subranges": [ - { - "start": 54, - "end": 117 - }, - { + "RAF": { + "chrom": "chr3", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { "start": 0, - "end": 54 - } - ], - "hit_range": { - "start": 12611999, - "end": 12618568 - }, - "hit_subranges": [ - { - "start": 1261199, - "end": 12612062 + "end": 117 }, - { - "start": 12618514, + "query_subranges": [ + { + "start": 54, + "end": 117 + }, + { + "start": 0, + "end": 54 + } + ], + "hit_range": { + "start": 12611999, "end": 12618568 - } - ] + }, + "hit_subranges": [ + { + "start": 1261199, + "end": 12612062 + }, + { + "start": 12618514, + "end": 12618568 + } + ] + } }, "urn:mavedb:00000068-a-1": { - "chrom": 
"chr17", - "strand": -1, - "coverage": 99.83079526226734, - "ident_pct": 99.91525423728814, - "query_range": { - "start": 0, - "end": 1180 - }, - "query_subranges": [ - { - "start": 1100, + "TP53 (P72R)": { + "chrom": "chr17", + "strand": -1, + "coverage": 99.83079526226734, + "ident_pct": 99.91525423728814, + "query_range": { + "start": 0, "end": 1180 }, - { - "start": 993, - "end": 1100 - }, - { - "start": 919, - "end": 993 - }, - { - "start": 782, - "end": 919 - }, - { - "start": 672, - "end": 782 - }, - { - "start": 559, - "end": 672 - }, - { - "start": 375, - "end": 559 - }, - { - "start": 96, - "end": 375 - }, - { - "start": 74, - "end": 96 - }, - { - "start": 0, - "end": 74 - } - ], - "hit_range": { - "start": 7676520, - "end": 7669690 - }, - "hit_subranges": [ - { - "start": 7669610, + "query_subranges": [ + { + "start": 1100, + "end": 1180 + }, + { + "start": 993, + "end": 1100 + }, + { + "start": 919, + "end": 993 + }, + { + "start": 782, + "end": 919 + }, + { + "start": 672, + "end": 782 + }, + { + "start": 559, + "end": 672 + }, + { + "start": 375, + "end": 559 + }, + { + "start": 96, + "end": 375 + }, + { + "start": 74, + "end": 96 + }, + { + "start": 0, + "end": 74 + } + ], + "hit_range": { + "start": 7676520, "end": 7669690 }, - { - "start": 7670608, - "end": 7670715 - }, - { - "start": 7673534, - "end": 7673608 - }, - { - "start": 7673700, - "end": 7673837 - }, - { - "start": 7674180, - "end": 7674290 - }, - { - "start": 7674858, - "end": 7674971 - }, - { - "start": 7675052, - "end": 7675236 - }, - { - "start": 7675993, - "end": 7676272 - }, - { - "start": 7676381, - "end": 7676403 - }, - { - "start": 7676520, - "end": 7676594 - } - ] + "hit_subranges": [ + { + "start": 7669610, + "end": 7669690 + }, + { + "start": 7670608, + "end": 7670715 + }, + { + "start": 7673534, + "end": 7673608 + }, + { + "start": 7673700, + "end": 7673837 + }, + { + "start": 7674180, + "end": 7674290 + }, + { + "start": 7674858, + "end": 7674971 + }, + { + "start": 7675052, 
+ "end": 7675236 + }, + { + "start": 7675993, + "end": 7676272 + }, + { + "start": 7676381, + "end": 7676403 + }, + { + "start": 7676520, + "end": 7676594 + } + ] + } }, "urn:mavedb:00000093-a-1": { - "chrom": "chr17", - "strand": -1, - "coverage": 100.0, - "ident_pct": 100.0, - "query_range": { - "start": 0, - "end": 300 - }, - "query_subranges": [ - { - "start": 212, + "BRCA1 translation start through RING domain": { + "chrom": "chr17", + "strand": -1, + "coverage": 100.0, + "ident_pct": 100.0, + "query_range": { + "start": 0, "end": 300 }, - { - "start": 134, - "end": 212 - }, - { - "start": 80, - "end": 134 - }, - { - "start": 0, - "end": 80 - } - ], - "hit_range": { - "start": 43104868, - "end": 43124096 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 212, + "end": 300 + }, + { + "start": 134, + "end": 212 + }, + { + "start": 80, + "end": 134 + }, + { + "start": 0, + "end": 80 + } + ], + "hit_range": { "start": 43104868, - "end": 43104956 - }, - { - "start": 43106455, - "end": 43106533 - }, - { - "start": 43115725, - "end": 43115779 - }, - { - "start": 43124016, "end": 43124096 - } - ] + }, + "hit_subranges": [ + { + "start": 43104868, + "end": 43104956 + }, + { + "start": 43106455, + "end": 43106533 + }, + { + "start": 43115725, + "end": 43115779 + }, + { + "start": 43124016, + "end": 43124096 + } + ] + } }, "urn:mavedb:00000001-b-2": { - "chrom": "chr2", - "strand": -1, - "coverage": 97.05882352941177, - "ident_pct": 100.0, - "query_range": { - "start": 9, - "end": 306 - }, - "query_subranges": [ - { - "start": 237, + "SUMO1": { + "chrom": "chr2", + "strand": -1, + "coverage": 97.05882352941177, + "ident_pct": 100.0, + "query_range": { + "start": 9, "end": 306 }, - { - "start": 165, - "end": 237 - }, - { - "start": 87, - "end": 165 - }, - { - "start": 9, - "end": 87 - } - ], - "hit_range": { - "start": 202207252, - "end": 202220109 - }, - "hit_subranges": [ - { + "query_subranges": [ + { + "start": 237, + "end": 306 + }, + { + "start": 
165, + "end": 237 + }, + { + "start": 87, + "end": 165 + }, + { + "start": 9, + "end": 87 + } + ], + "hit_range": { "start": 202207252, - "end": 202207321 - }, - { - "start": 202210734, - "end": 202210806 - }, - { - "start": 202214356, - "end": 202214434 - }, - { - "start": 202220031, "end": 202220109 - } - ] + }, + "hit_subranges": [ + { + "start": 202207252, + "end": 202207321 + }, + { + "start": 202210734, + "end": 202210806 + }, + { + "start": 202214356, + "end": 202214434 + }, + { + "start": 202220031, + "end": 202220109 + } + ] + } } } diff --git a/tests/fixtures/scoreset_metadata.json b/tests/fixtures/scoreset_metadata.json index f0465db..c61098c 100644 --- a/tests/fixtures/scoreset_metadata.json +++ b/tests/fixtures/scoreset_metadata.json @@ -2,130 +2,162 @@ "scoreset_metadata": [ { "urn": "urn:mavedb:00000002-a-2", - "target_gene_name": "hYAP65 WW domain", - "target_gene_category": "protein_coding", - "target_sequence": "GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT", - "target_sequence_type": "dna", - "target_uniprot_ref": { - "id": "uniprot:P46937", - "offset": 169 + "target_genes": { + "hYAP65 WW domain": { + "target_gene_name": "hYAP65 WW domain", + "target_gene_category": "protein_coding", + "target_sequence": "GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT", + "target_sequence_type": "dna", + "target_uniprot_ref": { + "id": "uniprot:P46937", + "offset": 169 + } + } } }, { "urn": "urn:mavedb:00000099-a-1", - "target_gene_name": "RHO", - "target_gene_category": "protein_coding", - "target_sequence": 
"ATGAATGGCACAGAAGGCCCTAACTTCTACGTGCCCTTCTCCAATGCGACGGGTGTGGTACGCAGCCCCTTCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTGGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTAGCCGTGGCTGACCTCTTCATGGTCCTAGGTGGCTTCACCAGCACCCTCTACACCTCTCTGCATGGATACTTCGTCTTCGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGCCACCCTGGGCGGTGAAATTGCCCTGTGGTCCTTGGTGGTCCTGGCCATCGAGCGGTACGTGGTGGTGTGTAAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCATGCCATCATGGGCGTTGCCTTCACCTGGGTCATGGCGCTGGCCTGCGCCGCACCCCCACTCGCCGGCTGGTCCAGGTACATCCCCGAGGGCCTGCAGTGCTCGTGTGGAATCGACTACTACACGCTCAAGCCGGAGGTCAACAACGAGTCTTTTGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTATCATCTTTTTCTGCTATGGGCAGCTCGTCTTCACCGTCAAGGAGGCCGCTGCCCAGCAGCAGGAGTCAGCCACCACACAGAAGGCAGAGAAGGAGGTCACCCGCATGGTCATCATCATGGTCATCGCTTTCCTGATCTGCTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGCTCCAACTTCGGTCCCATCTTCATGACCATCCCAGCGTTCTTTGCCAAGAGCGCCGCCATCTACAACCCTGTCATCTATATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACCACCATCTGCTGCGGCAAGAACCCACTGGGTGACGATGAGGCCTCTGCTACCGTGTCCAAGACGGAGACGAGCCAGGTGGCCCCGGCCTAA", - "target_sequence_type": "dna", - "target_uniprot_ref": null + "target_genes": { + "RHO": { + "target_gene_name": "RHO", + "target_gene_category": "protein_coding", + "target_sequence": 
"ATGAATGGCACAGAAGGCCCTAACTTCTACGTGCCCTTCTCCAATGCGACGGGTGTGGTACGCAGCCCCTTCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTGGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTAGCCGTGGCTGACCTCTTCATGGTCCTAGGTGGCTTCACCAGCACCCTCTACACCTCTCTGCATGGATACTTCGTCTTCGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGCCACCCTGGGCGGTGAAATTGCCCTGTGGTCCTTGGTGGTCCTGGCCATCGAGCGGTACGTGGTGGTGTGTAAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCATGCCATCATGGGCGTTGCCTTCACCTGGGTCATGGCGCTGGCCTGCGCCGCACCCCCACTCGCCGGCTGGTCCAGGTACATCCCCGAGGGCCTGCAGTGCTCGTGTGGAATCGACTACTACACGCTCAAGCCGGAGGTCAACAACGAGTCTTTTGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTATCATCTTTTTCTGCTATGGGCAGCTCGTCTTCACCGTCAAGGAGGCCGCTGCCCAGCAGCAGGAGTCAGCCACCACACAGAAGGCAGAGAAGGAGGTCACCCGCATGGTCATCATCATGGTCATCGCTTTCCTGATCTGCTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGCTCCAACTTCGGTCCCATCTTCATGACCATCCCAGCGTTCTTTGCCAAGAGCGCCGCCATCTACAACCCTGTCATCTATATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACCACCATCTGCTGCGGCAAGAACCCACTGGGTGACGATGAGGCCTCTGCTACCGTGTCCAAGACGGAGACGAGCCAGGTGGCCCCGGCCTAA", + "target_sequence_type": "dna", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000103-c-1", - "target_gene_name": "MAPK1", - "target_gene_category": "protein_coding", - "target_sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS", - "target_sequence_type": "protein", - "target_uniprot_ref": null + "target_genes": { + "MAPK1": { + "target_gene_name": "MAPK1", + "target_gene_category": "protein_coding", + "target_sequence": 
"MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS", + "target_sequence_type": "protein", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000041-a-1", - "target_gene_name": "Src catalytic domain", - "target_gene_category": "protein_coding", - "target_sequence": "CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGTGGATGGGGACCTGGAACGGTACCACCAGGGTGGCCATCAAAACCCTGAAGCCTGGCACGATGTCTCCAGAGGCCTTCCTGCAGGAGGCCCAGGTCATGAAGAAGCTGAGGCATGAGAAGCTGGTGCAGTTGTATGCTGTGGTTTCAGAGGAGCCCATTTACATCGTCACGGAGTACATGAGCAAGGGGAGTTTGCTGGACTTTCTCAAGGGGGAGACAGGCAAGTACCTGCGGCTGCCTCAGCTGGTGGACATGGCTGCTCAGATCGCCTCAGGCATGGCGTACGTGGAGCGGATGAACTACGTCCACCGGGACCTTCGTGCAGCCAACATCCTGGTGGGAGAGAACCTGGTGTGCAAAGTGGCCGACTTTGGGCTGGCTCGGCTCATTGAAGACAATGAGTACACGGCGCGGCAAGGTGCCAAATTCCCCATCAAGTGGACGGCTCCAGAAGCTGCCCTCTATGGCCGCTTCACCATCAAGTCGGACGTGTGGTCCTTCGGGATCCTGCTGACTGAGCTCACCACAAAGGGACGGGTGCCCTACCCTGGGATGGTGAACCGCGAGGTGCTGGACCAGGTGGAGCGGGGCTACCGGATGCCCTGCCCGCCGGAGTGTCCCGAGTCCCTGCACGACCTCATGTGCCAGTGCTGGCGGAAGGAGCCTGAGGAGCGGCCCACCTTCGAGTACCTGCAGGCCTTCCTG", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P12931", - "offset": 269 + "target_genes": { + "Src catalytic domain": { + "target_gene_name": "Src catalytic domain", + "target_gene_category": "protein_coding", + "target_sequence": 
"CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGTGGATGGGGACCTGGAACGGTACCACCAGGGTGGCCATCAAAACCCTGAAGCCTGGCACGATGTCTCCAGAGGCCTTCCTGCAGGAGGCCCAGGTCATGAAGAAGCTGAGGCATGAGAAGCTGGTGCAGTTGTATGCTGTGGTTTCAGAGGAGCCCATTTACATCGTCACGGAGTACATGAGCAAGGGGAGTTTGCTGGACTTTCTCAAGGGGGAGACAGGCAAGTACCTGCGGCTGCCTCAGCTGGTGGACATGGCTGCTCAGATCGCCTCAGGCATGGCGTACGTGGAGCGGATGAACTACGTCCACCGGGACCTTCGTGCAGCCAACATCCTGGTGGGAGAGAACCTGGTGTGCAAAGTGGCCGACTTTGGGCTGGCTCGGCTCATTGAAGACAATGAGTACACGGCGCGGCAAGGTGCCAAATTCCCCATCAAGTGGACGGCTCCAGAAGCTGCCCTCTATGGCCGCTTCACCATCAAGTCGGACGTGTGGTCCTTCGGGATCCTGCTGACTGAGCTCACCACAAAGGGACGGGTGCCCTACCCTGGGATGGTGAACCGCGAGGTGCTGGACCAGGTGGAGCGGGGCTACCGGATGCCCTGCCCGCCGGAGTGTCCCGAGTCCCTGCACGACCTCATGTGCCAGTGCTGGCGGAAGGAGCCTGAGGAGCGGCCCACCTTCGAGTACCTGCAGGCCTTCCTG", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P12931", + "offset": 269 + } + } } }, { "urn": "urn:mavedb:00000018-a-1", - "target_gene_name": "HBB promoter", - "target_gene_category": "regulatory", - "target_sequence": "GGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTT", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": null - }, - { - "urn": "urn:mavedb:00000001-a-4", - "target_gene_name": "UBE2I", - "target_gene_category": "protein_coding", - "target_sequence": "ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA", - "target_sequence_type": "dna", - 
"target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P63279", - "offset": 0 + "target_genes": { + "HBB promoter": { + "target_gene_name": "HBB promoter", + "target_gene_category": "regulatory", + "target_sequence": "GGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTT", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": null + } } }, { - "urn": "urn:mavedb:00000113-a-2", - "target_gene_name": "APP", - "target_gene_category": "protein_coding", - "target_sequence": "DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA", - "target_sequence_type": "protein", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P05067", - "offset": 0 + "urn": "urn:mavedb:00000001-a-4", + "target_genes": { + "UBE2I": { + "target_gene_name": "UBE2I", + "target_gene_category": "protein_coding", + "target_sequence": "ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P63279", + "offset": 0 + } + } } }, { "urn": "urn:mavedb:00000098-a-1", - "target_gene_name": "SCN5A", - "target_gene_category": "protein_coding", - "target_sequence": "LFRVIRLARIGR", - "target_sequence_type": "protein", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:Q14524", - "offset": 1620 + "target_genes": { + "SCN5A": { + "target_gene_name": 
"SCN5A", + "target_gene_category": "protein_coding", + "target_sequence": "LFRVIRLARIGR", + "target_sequence_type": "protein", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:Q14524", + "offset": 1620 + } + } } }, { "urn": "urn:mavedb:00000061-h-1", - "target_gene_name": "RAF", - "target_gene_category": "protein_coding", - "target_sequence": "TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAAGAACAGTGGTCAATGTGCGAAATGGAATGAGCTTGCATGACTGCCTTATGAAAGCACTCAAGGTGAGGGGC", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": { - "id": "uniprot:P04049", - "offset": 51 + "target_genes": { + "RAF": { + "target_gene_name": "RAF", + "target_gene_category": "protein_coding", + "target_sequence": "TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAAGAACAGTGGTCAATGTGCGAAATGGAATGAGCTTGCATGACTGCCTTATGAAAGCACTCAAGGTGAGGGGC", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": { + "id": "uniprot:P04049", + "offset": 51 + } + } } }, { "urn": "urn:mavedb:00000068-a-1", - "target_gene_name": "TP53 (P72R)", - "target_gene_category": "protein_coding", - "target_sequence": 
"ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", - "target_sequence_type": "dna", - "target_reference_genome": "hg38", - "target_uniprot_ref": null + "target_genes": { + "TP53 (P72R)": { + "target_gene_name": "TP53 (P72R)", + "target_gene_category": "protein_coding", + "target_sequence": 
"ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", + "target_sequence_type": "dna", + "target_reference_genome": "hg38", + "target_uniprot_ref": null + } + } }, { "urn": "urn:mavedb:00000093-a-1", - "target_gene_name": "BRCA1 translation start through RING domain", - "target_gene_category": "protein_coding", - "target_sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAG", - "target_sequence_type": "dna", - "target_reference_genome": "hg19", - "target_uniprot_ref": { - "id": "uniprot:P38398", - "offset": 0 + "target_genes": { + "BRCA1 translation start through RING domain": { + "target_gene_name": 
"BRCA1 translation start through RING domain", + "target_gene_category": "protein_coding", + "target_sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAG", + "target_sequence_type": "dna", + "target_reference_genome": "hg19", + "target_uniprot_ref": { + "id": "uniprot:P38398", + "offset": 0 + } + } } }, { "urn": "urn:mavedb:00000001-b-2", - "target_gene_name": "SUMO1", - "target_gene_category": "protein_coding", - "target_sequence": "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", - "target_sequence_type": "dna", - "target_uniprot_ref": { - "id": "uniprot:P63165", - "offset": 0 + "target_genes": { + "SUMO1": { + "target_gene_name": "SUMO1", + "target_gene_category": "protein_coding", + "target_sequence": "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", + "target_sequence_type": "dna", + "target_uniprot_ref": { + "id": "uniprot:P63165", + "offset": 0 + } + } } } ] diff --git a/tests/fixtures/test.fasta b/tests/fixtures/test.fasta new file mode 100644 index 0000000..b310257 --- /dev/null +++ b/tests/fixtures/test.fasta @@ -0,0 +1,2 @@ +>test_contig_random_sequence +ATTCCATTGCTATATTGCCGGGATGCAATTTTAGCCCGAATATTTATAGCGCGCGTATGATCCTACATC diff --git a/tests/fixtures/test.fasta.fai 
b/tests/fixtures/test.fasta.fai new file mode 100644 index 0000000..2ebabf3 --- /dev/null +++ b/tests/fixtures/test.fasta.fai @@ -0,0 +1 @@ +test_contig_random_sequence 69 29 69 70 diff --git a/tests/fixtures/transcript_result.json b/tests/fixtures/transcript_result.json index f93d5a9..6d3564a 100644 --- a/tests/fixtures/transcript_result.json +++ b/tests/fixtures/transcript_result.json @@ -1,82 +1,102 @@ { "urn:mavedb:00000002-a-2": { - "nm": "NM_001130145.3", - "np": "NP_001123617.1", - "start": 169, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR" + "hYAP65 WW domain": { + "nm": "NM_001130145.3", + "np": "NP_001123617.1", + "start": 169, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR" + } }, "urn:mavedb:00000099-a-1": { - "nm": "NM_000539.3", - "np": "NP_000530.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA" + "RHO": { + "nm": "NM_000539.3", + "np": "NP_000530.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA" + } }, "urn:mavedb:00000103-c-1": { - "nm": "NM_002745.5", - "np": "NP_002736.3", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", 
- "sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" + "MAPK1": { + "nm": "NM_002745.5", + "np": "NP_002736.3", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" + } }, "urn:mavedb:00000041-a-1": { - "nm": "NM_198291.3", - "np": "NP_938033.1", - "start": 269, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "LRLEVKLGQGCFGEVWMGTWNGTTRVAIKTLKPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKYLRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYTARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVERGYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFL" + "Src catalytic domain": { + "nm": "NM_198291.3", + "np": "NP_938033.1", + "start": 269, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "LRLEVKLGQGCFGEVWMGTWNGTTRVAIKTLKPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKYLRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYTARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVERGYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFL" + } }, "urn:mavedb:00000001-a-4": { - "nm": "NM_003345.5", - "np": "NP_003336.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": 
"MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" + "UBE2I": { + "nm": "NM_003345.5", + "np": "NP_003336.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" + } }, "urn:mavedb:00000098-a-1": { - "nm": "NM_000335.5", - "np": "NP_000326.2", - "start": 1619, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "LFRVIRLARIGR" + "SCN5A": { + "nm": "NM_000335.5", + "np": "NP_000326.2", + "start": 1619, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "LFRVIRLARIGR" + } }, "urn:mavedb:00000061-h-1": { - "nm": "NM_002880.4", - "np": "NP_002871.1", - "start": 51, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "SKTSNTIRVFLPNKQRTVVNVRNGMSLHDCLMKALKVRG" + "RAF": { + "nm": "NM_002880.4", + "np": "NP_002871.1", + "start": 51, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "SKTSNTIRVFLPNKQRTVVNVRNGMSLHDCLMKALKVRG" + } }, "urn:mavedb:00000068-a-1": { - "nm": "NM_000546.6", - "np": "NP_000537.3", - "start": 0, - "is_full_match": false, - "transcript_mode": "mane_select", - "sequence": "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" + "TP53 (P72R)": { + "nm": "NM_000546.6", + "np": "NP_000537.3", + "start": 0, + "is_full_match": false, + "transcript_mode": "mane_select", + "sequence": 
"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" + } }, "urn:mavedb:00000093-a-1": { - "nm": "NM_007294.4", - "np": "NP_009225.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLE" + "BRCA1 translation start through RING domain": { + "nm": "NM_007294.4", + "np": "NP_009225.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLE" + } }, "urn:mavedb:00000001-b-2": { - "nm": "NM_003352.8", - "np": "NP_003343.1", - "start": 0, - "is_full_match": true, - "transcript_mode": "mane_select", - "sequence": "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV" + "SUMO1": { + "nm": "NM_003352.8", + "np": "NP_003343.1", + "start": 0, + "is_full_match": true, + "transcript_mode": "mane_select", + "sequence": "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV" + } } } diff --git a/tests/test_mavedb_data.py b/tests/test_mavedb_data.py index 31258d6..751d0c7 100644 --- a/tests/test_mavedb_data.py +++ b/tests/test_mavedb_data.py @@ -41,19 +41,42 @@ def test_get_scoreset_metadata( urn, dcd_mapping_dir=resources_data_dir ) assert scoreset_metadata.urn == urn - assert scoreset_metadata.target_uniprot_ref - assert scoreset_metadata.target_uniprot_ref.id == "uniprot:P38398" - assert scoreset_metadata.target_uniprot_ref.offset == 0 + assert scoreset_metadata.target_genes + 
assert ( + scoreset_metadata.target_genes[ + "BRCA1 translation start through RING domain" + ].target_uniprot_ref.id + == "uniprot:P38398" + ) + assert ( + scoreset_metadata.target_genes[ + "BRCA1 translation start through RING domain" + ].target_uniprot_ref.offset + == 0 + ) -def test_get_scoreset_records(resources_data_dir: Path, fixture_data_dir: Path): +def test_get_scoreset_records( + resources_data_dir: Path, fixture_data_dir: Path, scoreset_metadata_response: dict +): urn = "urn:mavedb:00000093-a-1" with (fixture_data_dir / f"{urn}_scores.csv").open() as f: scores_csv_text = f.read() with requests_mock.Mocker() as m: + m.get( + f"https://api.mavedb.org/api/v1/score-sets/{urn}", + json=scoreset_metadata_response[urn], + ) + scoreset_metadata = get_scoreset_metadata( + urn, dcd_mapping_dir=resources_data_dir + ) m.get( f"https://api.mavedb.org/api/v1/score-sets/{urn}/scores", text=scores_csv_text, ) - scoreset_records = get_scoreset_records(urn, dcd_mapping_dir=resources_data_dir) - assert len(scoreset_records) == 853 + scoreset_records = get_scoreset_records( + scoreset_metadata, dcd_mapping_dir=resources_data_dir + ) + assert ( + len(scoreset_records["BRCA1 translation start through RING domain"]) == 853 + ) diff --git a/tests/test_vrs_map.py b/tests/test_vrs_map.py index 37133c2..20b0244 100644 --- a/tests/test_vrs_map.py +++ b/tests/test_vrs_map.py @@ -4,10 +4,12 @@ we're focused on remaining consistent w/ previous results. * Move expected data into a separate JSON file or something? 
""" +from collections.abc import Generator from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest +from cdot.hgvs.dataproviders import ChainedSeqFetcher from cool_seq_tool.schemas import AnnotationLayer from ga4gh.vrs._internal.models import Allele, Haplotype @@ -74,7 +76,9 @@ def get_fixtures( ): def _get_fixtures(urn: str): return ( - _load_scoreset_records(fixture_data_dir / f"{urn}_scores.csv"), + _load_scoreset_records( + fixture_data_dir / f"{urn}_scores.csv", scoreset_metadata_fixture[urn] + ), scoreset_metadata_fixture[urn], align_result_fixture[urn], transcript_results_fixture[urn], @@ -86,6 +90,7 @@ def _get_fixtures(urn: str): def test_2_a_2( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000002-a-2" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -102,11 +107,20 @@ def test_2_a_2( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 1 + mappings = {} + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["hYAP65 WW domain"] is not None + assert len(mappings["hYAP65 WW domain"]) == 1 - for m in mappings: + for m in mappings["hYAP65 WW domain"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -127,6 +141,7 @@ def test_2_a_2( def test_41_a_1( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000041-a-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -154,11 +169,20 @@ def 
test_41_a_1( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 5 + mappings = {} + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["Src catalytic domain"] is not None + assert len(mappings["Src catalytic domain"]) == 5 - for m in mappings: + for m in mappings["Src catalytic domain"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -179,6 +203,7 @@ def test_41_a_1( def test_99_a_1( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000099-a-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -217,11 +242,20 @@ def test_99_a_1( "post_mapped": "ga4gh:VA.HSfipwsg28LbqwITCawzumz_OWZYu_jM", }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 8 # includes protein and genomic for all 4 rows + mappings = {} + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["RHO"] is not None + assert len(mappings["RHO"]) == 8 # includes protein and genomic for all 4 rows - for m in mappings: + for m in mappings["RHO"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -242,6 +276,7 @@ def test_99_a_1( def test_103_c_1( get_fixtures, 
mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000103-c-1" records, metadata, align_result, tx_result = get_fixtures(urn) @@ -265,10 +300,19 @@ def test_103_c_1( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 4 - for m in mappings: + mappings = {} + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + transcript=tx_result[target_gene], + ) + assert mappings["MAPK1"] is not None + assert len(mappings["MAPK1"]) == 4 + for m in mappings["MAPK1"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -285,13 +329,11 @@ def test_103_c_1( def test_1_b_2( get_fixtures, mock_seqrepo_access: MagicMock, + data_provider: Generator[ChainedSeqFetcher, None, None], ): urn = "urn:mavedb:00000001-b-2" records, metadata, align_result, tx_result = get_fixtures(urn) - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - expected_mappings_data = { ("urn:mavedb:00000001-b-2#444", AnnotationLayer.PROTEIN): { "pre_mapped": "ga4gh:VA.ojIs4GEPbxiMt5E6InnF6k6m9ix_z3SH", @@ -327,10 +369,19 @@ def test_1_b_2( }, } - mappings = vrs_map(metadata, align_result, records, transcript=tx_result) - assert mappings is not None - assert len(mappings) == 8 - for m in mappings: + mappings = {} + with patch("dcd_mapping.lookup.seqfetcher") as mock_cdot_seqfetcher: + mock_cdot_seqfetcher.return_value = data_provider + for target_gene in metadata.target_genes: + mappings[target_gene] = vrs_map( + metadata=metadata.target_genes[target_gene], + align_result=align_result[target_gene], + records=records[target_gene], + 
transcript=tx_result[target_gene], + ) + assert mappings["SUMO1"] is not None + assert len(mappings["SUMO1"]) == 8 + for m in mappings["SUMO1"]: _assert_correct_vrs_map(m, expected_mappings_data) store_calls = [ @@ -342,14 +393,6 @@ def test_1_b_2( "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", [{"namespace": "ga4gh", "alias": "SQ.i1KiGldkfULl1XcEI-XBwhiM7x3PK5Xk"}], ), - ( - "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV", - [{"namespace": "ga4gh", "alias": "SQ.VkCzFNsbifqfq61Mud6oGmz0ID6CLIip"}], - ), - ( - "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", - [{"namespace": "ga4gh", "alias": "SQ.i1KiGldkfULl1XcEI-XBwhiM7x3PK5Xk"}], - ), ] for call in store_calls: mock_seqrepo_access.sr.store.assert_any_call(*call)