diff --git a/schema.json b/schema.json index 737d844..b017cc3 100644 --- a/schema.json +++ b/schema.json @@ -370,11 +370,7 @@ "title": "Version" }, "code": { - "allOf": [ - { - "$ref": "#/$defs/Code" - } - ], + "$ref": "#/$defs/Code", "description": "A symbol uniquely identifying the concept, as in a syntax defined by the code system. CURIE format is preferred where possible (e.g. 'SO:0000704' is the CURIE form of the Sequence Ontology code for 'gene')." } }, @@ -412,19 +408,11 @@ "description": "A mapping to a concept in a terminology or code system.", "properties": { "coding": { - "allOf": [ - { - "$ref": "#/$defs/Coding" - } - ], + "$ref": "#/$defs/Coding", "description": "A structured representation of a code for a defined concept in a terminology or code system." }, "relation": { - "allOf": [ - { - "$ref": "#/$defs/Relation" - } - ], + "$ref": "#/$defs/Relation", "description": "A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS)." } }, @@ -439,11 +427,7 @@ "description": "Representation of a variation by a specified nomenclature or syntax for a\nVariation object. Common examples of expressions for the description of molecular\nvariation include the HGVS and ISCN nomenclatures.", "properties": { "syntax": { - "allOf": [ - { - "$ref": "#/$defs/Syntax" - } - ], + "$ref": "#/$defs/Syntax", "description": "The syntax used to describe the variation. The value should be one of the supported syntaxes." }, "value": { @@ -751,11 +735,7 @@ "title": "Mappings" }, "sequence": { - "allOf": [ - { - "$ref": "#/$defs/SequenceString" - } - ], + "$ref": "#/$defs/SequenceString", "description": "the literal sequence" } }, @@ -972,21 +952,24 @@ "pre_mapped": { "anyOf": [ { - "$ref": "#/$defs/Allele" + "$ref": "#/$defs/CisPhasedBlock" }, { - "$ref": "#/$defs/CisPhasedBlock" + "$ref": "#/$defs/Allele" } ], "title": "Pre Mapped" }, "post_mapped": { "anyOf": [ + { + "$ref": "#/$defs/CisPhasedBlock" + }, { "$ref": "#/$defs/Allele" }, { - "$ref": "#/$defs/CisPhasedBlock" + "type": "null" } ], "title": "Post Mapped" diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index cdadb2f..3fbdf35 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -165,9 +165,16 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: """ with tempfile.NamedTemporaryFile() as query_file: query_file = _build_query_file(metadata, Path(query_file.name)) + if len(metadata.target_sequence) > 25000: + msg = f"Target sequence for {metadata.urn} must have a length <= 25000 to run BLAT" + raise AlignmentError(msg) + if metadata.target_sequence_type == TargetSequenceType.PROTEIN: target_args = "-q=prot -t=dnax" - elif metadata.target_gene_category == TargetType.PROTEIN_CODING: + elif ( + metadata.target_gene_category == TargetType.PROTEIN_CODING + and len(metadata.target_sequence) <= 10000 + ): target_args = "-q=dnax -t=dnax" else: target_args = "" diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 3c5e964..0c89942 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -21,6 +21,7 @@ CisPhasedBlock, Expression, LiteralSequenceExpression, + SequenceString, ) from dcd_mapping.lookup import ( @@ -53,6 +54,27 @@ def _get_vrs_1_3_ext(allele: Allele) -> Extension: ) +def _get_va_digest(allele: Allele) -> Extension: + """Return the VA digest for a pre-mapped allele + :param allele: A pre-mapped variant + :return A VRS extension reporting the pre-mapped digest + """ + return Extension(name="pre_mapped_id", value=allele.id) + + +def _is_valid_allele(allele: Allele, align_result: AlignmentResult) -> bool: + """Check if a post-mapped allele occurs within the alignment coverage + :param allele: A post-mapped allele + :param align_result: Alignment data + :return True if position occurs in coverage, False if not + """ + return ( + align_result.query_range.start + <= allele.location.start + <= align_result.query_range.end + ) + + def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]: """Handle known edge cases in start and end coordinates for vrs_ref_allele_seq.""" if ss.startswith("urn:mavedb:00000060-a-1"): @@ -94,7 +116,7 @@ def _get_vrs_ref_allele_seq( ref = sr.get_sequence(seq, start, end) if ref is None: raise ValueError - return Extension(name="vrs_ref_allele_seq", value=ref) + return SequenceString(root=ref) def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]: @@ -179,6 +201,7 @@ def _annotate_allele_mapping( mapped_score: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata, + align_result: AlignmentResult, ) -> ScoreAnnotationWithLayer: """Perform annotations for allele mappings.""" pre_mapped: Allele = mapped_score.pre_mapped @@ -186,9 +209,11 @@ def _annotate_allele_mapping( # get vrs_ref_allele_seq for pre-mapped variants pre_mapped.extensions = [ - _get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results), _get_vrs_1_3_ext(pre_mapped), ] + pre_mapped.location.sequence = _get_vrs_ref_allele_seq( + pre_mapped, metadata, tx_results + ) # Determine reference sequence if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: @@ -206,10 +231,12 @@ def _annotate_allele_mapping( sr = get_seqrepo() loc = mapped_score.post_mapped.location sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) + post_mapped.location.sequence = SequenceString( + root=sr.get_sequence(sequence_id, loc.start, loc.end) + ) post_mapped.extensions = [ - Extension(name="vrs_ref_allele_seq", value=ref), _get_vrs_1_3_ext(post_mapped), + _get_va_digest(pre_mapped), ] hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] @@ -217,6 +244,16 @@ def _annotate_allele_mapping( namespace = metadata.urn val = mapped_score.accession_id.split("#")[1] + # Check if post-mapped allele is valid + if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: + post_mapped = ( + post_mapped if _is_valid_allele(pre_mapped, align_result) else None + ) + + # Remove extra digest attributes + pre_mapped.digest = None + post_mapped.digest = None + return ScoreAnnotationWithLayer( pre_mapped=pre_mapped, post_mapped=post_mapped, @@ -240,7 +277,10 @@ def _get_vrs_1_3_haplotype_id(cpb: CisPhasedBlock) -> str: def _annotate_cpb_mapping( - mapping: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata + mapping: MappedScore, + tx_results: TxSelectResult | None, + metadata: ScoresetMetadata, + align_result: AlignmentResult, ) -> ScoreAnnotationWithLayer: """Perform annotations and create VRS 1.3 equivalents for CisPhasedBlock mappings.""" pre_mapped: CisPhasedBlock = mapping.pre_mapped # type: ignore @@ -248,9 +288,10 @@ def _annotate_cpb_mapping( # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: allele.extensions = [ - _get_vrs_ref_allele_seq(allele, metadata, tx_results), _get_vrs_1_3_ext(allele), ] + allele.location.sequence = _get_vrs_ref_allele_seq(allele, metadata, tx_results) + allele.digest = None # Determine reference sequence if mapping.annotation_layer == AnnotationLayer.GENOMIC: sequence_id = ( @@ -267,23 +308,37 @@ def _annotate_cpb_mapping( accession = tx_results.np sr = get_seqrepo() - for allele in post_mapped.members: - loc = allele.location + valid_post_mapped_alleles = [] + for post_mapped_allele, pre_mapped_allele in zip( + post_mapped.members, pre_mapped.members, strict=True + ): + loc = post_mapped_allele.location sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) - allele.extensions = [ - Extension(name="vrs_ref_allele_seq", value=ref), - _get_vrs_1_3_ext(allele), + post_mapped_allele.location.sequence = SequenceString( + root=sr.get_sequence(sequence_id, loc.start, loc.end) + ) + post_mapped_allele.extensions = [ + _get_vrs_1_3_ext(post_mapped_allele), + _get_va_digest(pre_mapped_allele), ] - hgvs, syntax = _get_hgvs_string(allele, accession) - allele.expressions = [Expression(syntax=syntax, value=hgvs)] + hgvs, syntax = _get_hgvs_string(post_mapped_allele, accession) + post_mapped_allele.expressions = [Expression(syntax=syntax, value=hgvs)] + if mapping.annotation_layer == AnnotationLayer.PROTEIN or _is_valid_allele( + pre_mapped_allele, align_result + ): + valid_post_mapped_alleles.append(post_mapped_allele) + post_mapped_allele.digest = None + post_mapped.members = valid_post_mapped_alleles pre_mapped.extensions = [ Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(pre_mapped)) ] - post_mapped.extensions = [ - Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped)) - ] + if len(post_mapped.members) >= 2: + post_mapped.extensions = [ + Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped)), + ] + else: + post_mapped = post_mapped.members[0] namespace = metadata.urn val = mapping.accession_id.split("#")[1] @@ -301,6 +356,7 @@ def annotate( mapped_scores: list[MappedScore], tx_results: TxSelectResult | None, metadata: ScoresetMetadata, + align_result: AlignmentResult, ) -> list[ScoreAnnotationWithLayer]: """Given a list of mappings, add additional contextual data: @@ -316,6 +372,7 @@ def annotate( :param vrs_results: in-progress variant mappings :param tx_select_results: transcript selection if available :param metadata: MaveDB scoreset metadata + :param align_result: Alignment data :return: annotated mappings objects """ score_annotations = [] @@ -324,13 +381,15 @@ def annotate( mapped_score.post_mapped, CisPhasedBlock ): score_annotations.append( - _annotate_cpb_mapping(mapped_score, tx_results, metadata) + _annotate_cpb_mapping(mapped_score, tx_results, metadata, align_result) ) elif isinstance(mapped_score.pre_mapped, Allele) and isinstance( mapped_score.post_mapped, Allele ): score_annotations.append( - _annotate_allele_mapping(mapped_score, tx_results, metadata) + _annotate_allele_mapping( + mapped_score, tx_results, metadata, align_result + ) ) else: ValueError("inconsistent variant structure") diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 7b91226..3cc0863 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -197,7 +197,7 @@ async def map_scoreset( _emit_info("VRS mapping complete.", silent) _emit_info("Annotating metadata and saving to file...", silent) - vrs_results = annotate(vrs_results, transcript, metadata) + vrs_results = annotate(vrs_results, transcript, metadata, alignment_result) final_output = save_mapped_output_json( metadata.urn, vrs_results, diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 1730f76..e256bd6 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -18,9 +18,9 @@ class TargetSequenceType(str, Enum): class TargetType(str, Enum): """Define target gene types.""" - PROTEIN_CODING = "Protein coding" - REGULATORY = "Regulatory" - OTHER_NC = "Other noncoding" + PROTEIN_CODING = "protein_coding" + REGULATORY = "regulatory" + OTHER_NC = "other_noncoding" class UniProtRef(BaseModel): @@ -145,7 +145,7 @@ class MappedScore(BaseModel): annotation_layer: AnnotationLayer score: str | None pre_mapped: Allele | CisPhasedBlock - post_mapped: Allele | CisPhasedBlock | None + post_mapped: Allele | CisPhasedBlock class ScoreAnnotation(BaseModel): @@ -155,7 +155,7 @@ class ScoreAnnotation(BaseModel): """ pre_mapped: CisPhasedBlock | Allele - post_mapped: CisPhasedBlock | Allele + post_mapped: CisPhasedBlock | Allele | None mavedb_id: StrictStr relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to" score: float | None diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 5b9d884..f54523d 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -462,20 +462,7 @@ def _get_variation( # Run ga4gh_identify to assign VA digest allele.id = ga4gh_identify(allele) - - # Check if the start of an allele is covered by the alignment block for - # post-mapped genomic variants - if layer == AnnotationLayer.GENOMIC: - if pre_map: - alleles.append(allele) - else: - if ( - allele.location.start >= alignment.hit_range.start - and allele.location.start < alignment.hit_range.end - ): - alleles.append(allele) - else: - alleles.append(allele) + alleles.append(allele) if not alleles: return None diff --git a/tests/fixtures/scoreset_metadata.json b/tests/fixtures/scoreset_metadata.json index 04e8b67..f0465db 100644 --- a/tests/fixtures/scoreset_metadata.json +++ b/tests/fixtures/scoreset_metadata.json @@ -3,7 +3,7 @@ { "urn": "urn:mavedb:00000002-a-2", "target_gene_name": "hYAP65 WW domain", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT", "target_sequence_type": "dna", "target_uniprot_ref": { @@ -14,7 +14,7 @@ { "urn": "urn:mavedb:00000099-a-1", "target_gene_name": "RHO", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "ATGAATGGCACAGAAGGCCCTAACTTCTACGTGCCCTTCTCCAATGCGACGGGTGTGGTACGCAGCCCCTTCGAGTACCCACAGTACTACCTGGCTGAGCCATGGCAGTTCTCCATGCTGGCCGCCTACATGTTTCTGCTGATCGTGCTGGGCTTCCCCATCAACTTCCTCACGCTCTACGTCACCGTCCAGCACAAGAAGCTGCGCACGCCTCTCAACTACATCCTGCTCAACCTAGCCGTGGCTGACCTCTTCATGGTCCTAGGTGGCTTCACCAGCACCCTCTACACCTCTCTGCATGGATACTTCGTCTTCGGGCCCACAGGATGCAATTTGGAGGGCTTCTTTGCCACCCTGGGCGGTGAAATTGCCCTGTGGTCCTTGGTGGTCCTGGCCATCGAGCGGTACGTGGTGGTGTGTAAGCCCATGAGCAACTTCCGCTTCGGGGAGAACCATGCCATCATGGGCGTTGCCTTCACCTGGGTCATGGCGCTGGCCTGCGCCGCACCCCCACTCGCCGGCTGGTCCAGGTACATCCCCGAGGGCCTGCAGTGCTCGTGTGGAATCGACTACTACACGCTCAAGCCGGAGGTCAACAACGAGTCTTTTGTCATCTACATGTTCGTGGTCCACTTCACCATCCCCATGATTATCATCTTTTTCTGCTATGGGCAGCTCGTCTTCACCGTCAAGGAGGCCGCTGCCCAGCAGCAGGAGTCAGCCACCACACAGAAGGCAGAGAAGGAGGTCACCCGCATGGTCATCATCATGGTCATCGCTTTCCTGATCTGCTGGGTGCCCTACGCCAGCGTGGCATTCTACATCTTCACCCACCAGGGCTCCAACTTCGGTCCCATCTTCATGACCATCCCAGCGTTCTTTGCCAAGAGCGCCGCCATCTACAACCCTGTCATCTATATCATGATGAACAAGCAGTTCCGGAACTGCATGCTCACCACCATCTGCTGCGGCAAGAACCCACTGGGTGACGATGAGGCCTCTGCTACCGTGTCCAAGACGGAGACGAGCCAGGTGGCCCCGGCCTAA", "target_sequence_type": "dna", "target_uniprot_ref": null @@ -22,7 +22,7 @@ { "urn": "urn:mavedb:00000103-c-1", "target_gene_name": "MAPK1", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS", "target_sequence_type": "protein", "target_uniprot_ref": null @@ -30,7 +30,7 @@ { "urn": "urn:mavedb:00000041-a-1", "target_gene_name": "Src catalytic domain", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGTGGATGGGGACCTGGAACGGTACCACCAGGGTGGCCATCAAAACCCTGAAGCCTGGCACGATGTCTCCAGAGGCCTTCCTGCAGGAGGCCCAGGTCATGAAGAAGCTGAGGCATGAGAAGCTGGTGCAGTTGTATGCTGTGGTTTCAGAGGAGCCCATTTACATCGTCACGGAGTACATGAGCAAGGGGAGTTTGCTGGACTTTCTCAAGGGGGAGACAGGCAAGTACCTGCGGCTGCCTCAGCTGGTGGACATGGCTGCTCAGATCGCCTCAGGCATGGCGTACGTGGAGCGGATGAACTACGTCCACCGGGACCTTCGTGCAGCCAACATCCTGGTGGGAGAGAACCTGGTGTGCAAAGTGGCCGACTTTGGGCTGGCTCGGCTCATTGAAGACAATGAGTACACGGCGCGGCAAGGTGCCAAATTCCCCATCAAGTGGACGGCTCCAGAAGCTGCCCTCTATGGCCGCTTCACCATCAAGTCGGACGTGTGGTCCTTCGGGATCCTGCTGACTGAGCTCACCACAAAGGGACGGGTGCCCTACCCTGGGATGGTGAACCGCGAGGTGCTGGACCAGGTGGAGCGGGGCTACCGGATGCCCTGCCCGCCGGAGTGTCCCGAGTCCCTGCACGACCTCATGTGCCAGTGCTGGCGGAAGGAGCCTGAGGAGCGGCCCACCTTCGAGTACCTGCAGGCCTTCCTG", "target_sequence_type": "dna", "target_reference_genome": "hg38", @@ -42,7 +42,7 @@ { "urn": "urn:mavedb:00000018-a-1", "target_gene_name": "HBB promoter", - "target_gene_category": "Regulatory", + "target_gene_category": "regulatory", "target_sequence": "GGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTT", "target_sequence_type": "dna", "target_reference_genome": "hg38", @@ -51,7 +51,7 @@ { "urn": "urn:mavedb:00000001-a-4", "target_gene_name": "UBE2I", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA", "target_sequence_type": "dna", "target_reference_genome": "hg38", @@ -63,7 +63,7 @@ { "urn": "urn:mavedb:00000113-a-2", "target_gene_name": "APP", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA", "target_sequence_type": "protein", "target_reference_genome": "hg38", @@ -75,7 +75,7 @@ { "urn": "urn:mavedb:00000098-a-1", "target_gene_name": "SCN5A", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "LFRVIRLARIGR", "target_sequence_type": "protein", "target_reference_genome": "hg38", @@ -87,7 +87,7 @@ { "urn": "urn:mavedb:00000061-h-1", "target_gene_name": "RAF", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAAGAACAGTGGTCAATGTGCGAAATGGAATGAGCTTGCATGACTGCCTTATGAAAGCACTCAAGGTGAGGGGC", "target_sequence_type": "dna", "target_reference_genome": "hg38", @@ -99,7 +99,7 @@ { "urn": "urn:mavedb:00000068-a-1", "target_gene_name": "TP53 (P72R)", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTAG", "target_sequence_type": "dna", "target_reference_genome": "hg38", @@ -108,7 +108,7 @@ { "urn": "urn:mavedb:00000093-a-1", "target_gene_name": "BRCA1 translation start through RING domain", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAG", "target_sequence_type": "dna", "target_reference_genome": "hg19", @@ -120,7 +120,7 @@ { "urn": "urn:mavedb:00000001-b-2", "target_gene_name": "SUMO1", - "target_gene_category": "Protein coding", + "target_gene_category": "protein_coding", "target_sequence": "ATGTCTGACCAGGAGGCAAAACCTTCAACTGAGGACTTGGGGGATAAGAAGGAAGGTGAATATATTAAACTCAAAGTCATTGGACAGGATAGCAGTGAGATTCACTTCAAAGTGAAAATGACAACACATCTCAAGAAACTCAAAGAATCATACTGTCAAAGACAGGGTGTTCCAATGAATTCACTCAGGTTTCTCTTTGAGGGTCAGAGAATTGCTGATAATCATACTCCAAAAGAACTGGGAATGGAGGAAGAAGATGTGATTGAAGTTTATCAGGAACAAACGGGGGGTCATTCAACAGTTTAG", "target_sequence_type": "dna", "target_uniprot_ref": { diff --git a/tests/fixtures/scoreset_metadata_response.json b/tests/fixtures/scoreset_metadata_response.json index 58ad764..74187d4 100644 --- a/tests/fixtures/scoreset_metadata_response.json +++ b/tests/fixtures/scoreset_metadata_response.json @@ -166,7 +166,7 @@ "targetGenes": [ { "name": "BRCA1 translation start through RING domain", - "category": "Protein coding", + "category": "protein_coding", "externalIdentifiers": [ { "identifier": { diff --git a/tests/test_annotate.py b/tests/test_annotate.py index 9a48023..c46e3a3 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate.py @@ -5,7 +5,12 @@ import pytest from dcd_mapping.annotate import annotate -from dcd_mapping.schemas import MappedScore, ScoresetMetadata, TxSelectResult +from dcd_mapping.schemas import ( + AlignmentResult, + MappedScore, + ScoresetMetadata, + TxSelectResult, +) @pytest.fixture() @@ -13,12 +18,14 @@ def get_fixtures( scoreset_metadata_fixture: dict[str, ScoresetMetadata], transcript_results_fixture: dict[str, TxSelectResult], mapped_scores_fixture: dict[str, list[MappedScore]], + align_result_fixture: dict[str, AlignmentResult], ): def _get_fixtures(urn: str): return ( mapped_scores_fixture[urn], transcript_results_fixture[urn], scoreset_metadata_fixture[urn], + align_result_fixture[urn], ) return _get_fixtures @@ -26,9 +33,9 @@ def _get_fixtures(urn: str): def test_2_a_2(get_fixtures, mock_seqrepo_access: MagicMock): # noqa: ARG001 urn = "urn:mavedb:00000002-a-2" - mapped_scores, tx_results, metadata = get_fixtures(urn) + mapped_scores, tx_results, metadata, align_result = get_fixtures(urn) - annotate_result = annotate(mapped_scores, tx_results, metadata) + annotate_result = annotate(mapped_scores, tx_results, metadata, align_result) expected_list = [ {