33"""
44
55import logging
6+ import re
67
78import polars as pl
89from civicpy .civic import ExonCoordinate , MolecularProfile
910from cool_seq_tool .schemas import Assembly , CoordinateType
1011from ga4gh .core .models import MappableConcept
12+ from ga4gh .vrs .models import LiteralSequenceExpression
1113from pydantic import BaseModel
1214
1315from fusor .fusion_caller_models import (
2426)
2527from fusor .fusor import FUSOR
2628from fusor .models import (
29+ LINKER_REGEX ,
2730 AnchoredReads ,
2831 Assay ,
2932 AssayedFusion ,
3336 ContigSequence ,
3437 EventType ,
3538 GeneElement ,
39+ LinkerElement ,
3640 MultiplePossibleGenesElement ,
3741 ReadData ,
3842 SpanningReads ,
@@ -76,6 +80,7 @@ def _format_fusion(
7680 rf : bool | None = None ,
7781 assay : Assay | None = None ,
7882 contig : ContigSequence | None = None ,
83+ linker_sequence : LinkerElement | None = None ,
7984 reads : ReadData | None = None ,
8085 molecular_profiles : list [MolecularProfile ] | None = None ,
8186 ) -> AssayedFusion | CategoricalFusion :
@@ -90,6 +95,7 @@ def _format_fusion(
9095 :param rf: A boolean indicating if the reading frame is preserved
9196 :param assay: Assay
9297 :param contig: The contig sequence
98+ :param linker_sequence: The non-template linker sequence
9399 :param reads: The read data
94100 :return: AssayedFusion or CategoricalFusion object
95101 """
@@ -109,6 +115,8 @@ def _format_fusion(
109115 params ["structure" ] = [gene_5prime , tr_3prime ]
110116 else :
111117 params ["structure" ] = [tr_5prime , tr_3prime ]
118+ if linker_sequence :
119+ params ["structure" ].insert (1 , linker_sequence )
112120 return fusion_type (** params )
113121
114122 def _get_causative_event (
@@ -585,10 +593,22 @@ async def from_arriba(
585593 )
586594 )
587595 rf = bool (arriba .rf == "in-frame" ) if arriba .rf != "." else None
596+
597+ # Process read data and fusion_transcript sequence
588598 read_data = ReadData (
589599 spanning = SpanningReads (spanningReads = arriba .discordant_mates )
590600 )
591601 contig = ContigSequence (contig = arriba .fusion_transcript )
602+ linker_sequence = re .search (LINKER_REGEX , arriba .fusion_transcript )
603+ linker_sequence = (
604+ LinkerElement (
605+ linkerSequence = LiteralSequenceExpression (
606+ sequence = linker_sequence .group (1 ).upper ()
607+ )
608+ )
609+ if linker_sequence
610+ else None
611+ )
592612
593613 return self ._format_fusion (
594614 AssayedFusion ,
@@ -604,6 +624,7 @@ async def from_arriba(
604624 rf ,
605625 contig = contig ,
606626 reads = read_data ,
627+ linker_sequence = linker_sequence ,
607628 )
608629
609630 async def from_cicero (
0 commit comments