@@ -8,18 +8,21 @@ import sortedcontainers
88from cpython cimport array
99import array
1010import logging
11- from dysgu.map_set_utils cimport Py_SimpleGraph
12- from dysgu.map_set_utils cimport multimap as cpp_map
13- from dysgu cimport map_set_utils
11+
1412from dysgu.io_funcs import intersecter
13+
14+ from dysgu.map_set_utils cimport Py_SimpleGraph, Py_Int2IntMap
15+ from dysgu.map_set_utils cimport multimap as cpp_map
1516from dysgu.map_set_utils cimport cigar_clip, clip_sizes, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance
1617from dysgu.map_set_utils cimport hash as xxhasher
1718from dysgu.map_set_utils cimport MinimizerTable
1819from dysgu.map_set_utils cimport set as ankerl_set
1920from dysgu.map_set_utils cimport map as ankerl_map
21+ from dysgu.map_set_utils cimport TranscriptData
22+ from dysgu.map_set_utils import echo # for debugging
2023
2124from dysgu.extra_metrics import BadClipCounter
22- from dysgu.map_set_utils import echo # for debugging
25+
2326from libcpp.string cimport string
2427from libcpp.deque cimport deque as cpp_deque
2528from libcpp.vector cimport vector
@@ -32,20 +35,11 @@ from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_ciga
3235
3336ctypedef cpp_pair[int , int ] cpp_item
3437
35- ctypedef map_set_utils.Py_IntSet Py_IntSet
36- ctypedef map_set_utils.Py_Int2IntMap Py_Int2IntMap
37-
3838ctypedef cpp_map[int , cpp_item] ScopeItem_t
3939ctypedef vector[ScopeItem_t] ForwardScope_t
4040
4141ctypedef cpp_pair[int , cpp_item] event_item
4242ctypedef cpp_pair[long , int ] cpp_long_pair
43- ctypedef long int long_int
44-
45- ctypedef PairedEndScoper PairedEndScoper_t
46- ctypedef TemplateEdges TemplateEdges_t
47- ctypedef NodeToName NodeToName_t
48- ctypedef ClipScoper ClipScoper_t
4943
5044
5145# 1 = soft-clipped split-read, the supplementary mapping might be discarded. whole read is a node
@@ -794,7 +788,7 @@ class AlignmentsSA:
794788 self .join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index))
795789
796790#
797- cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, pos, node_name):
791+ cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper clip_scope, chrom, pos, node_name):
798792 cdef int other_node
799793 cdef int count = 0
800794 cdef ankerl_set[int ] clustered_nodes
@@ -807,10 +801,10 @@ cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, po
807801 return count
808802
809803
810- cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_scope, TemplateEdges_t template_edges,
804+ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper pe_scope, TemplateEdges template_edges,
811805 NodeToName node_to_name, genome_scanner,
812806 int flag, int chrom, tell, int cigar_index, int event_pos, int query_pos,
813- int chrom2, int pos2, ClipScoper_t clip_scope, ReadEnum_t read_enum,
807+ int chrom2, int pos2, ClipScoper clip_scope, ReadEnum_t read_enum,
814808 bint p1_overlaps, bint p2_overlaps, bint mm_only, int clip_l, site_adder,
815809 int length_from_cigar, bint trust_ins_len, bint paired_end):
816810 # Adds relevant information to graph and other data structures for further processing
@@ -962,10 +956,10 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length):
962956
963957
964958cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int loci_dist, gettid,
965- overlap_regions, int clustering_dist, PairedEndScoper_t pe_scope,
959+ overlap_regions, int clustering_dist, PairedEndScoper pe_scope,
966960 int cigar_index, int event_pos, int paired_end, long tell, genome_scanner,
967- TemplateEdges_t template_edges, NodeToName node_to_name,
968- int cigar_pos2, int mapq_thresh, ClipScoper_t clip_scope,
961+ TemplateEdges template_edges, NodeToName node_to_name,
962+ int cigar_pos2, int mapq_thresh, ClipScoper clip_scope,
969963 ReadEnum_t read_enum, bad_clip_counter, bint mm_only, site_adder,
970964 int length_from_cigar, bint trust_ins_len):
971965 cdef int other_node, clip_left, clip_right
@@ -1179,7 +1173,7 @@ class SiteAdder:
11791173 else :
11801174 self .scope = sortedcontainers.sortedlist.SortedList([site], key = lambda x : x.start)
11811175 self .current_chrom = site.chrom
1182- def add_any_sites (self , int chrom , int pos , Py_SimpleGraph G , PairedEndScoper_t pe_scope , NodeToName node_to_name , cluster_dist ):
1176+ def add_any_sites (self , int chrom , int pos , Py_SimpleGraph G , PairedEndScoper pe_scope , NodeToName node_to_name , cluster_dist ):
11831177 cdef int node_name, start, stop, file_index
11841178 cdef ReadEnum_t read_enum
11851179 if chrom not in self .sites_queue:
@@ -1247,28 +1241,39 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
12471241 float norm_thresh = 100 , float spd_thresh = 0.3 , bint mm_only = False ,
12481242 sites = None , bint trust_ins_len = True , bint low_mem = False , temp_dir = " ." ,
12491243 bint find_n_aligned_bases = True , float position_distance_thresh = 0.8 , int max_search_depth = 20 ,
1250- float max_divergence = 0.2 , bint no_phase = False ):
1244+ float max_divergence = 0.2 , bint no_phase = False , transcript_gaps_file = " " ):
12511245 logging.info(" Building cluster graph" )
12521246
12531247 # Edges are added between alignments from same template, after building main graph
1254- cdef TemplateEdges_t template_edges = TemplateEdges()
1248+ cdef TemplateEdges template_edges = TemplateEdges()
12551249
12561250 # Map of nodes -> read ids
12571251 cdef NodeToName node_to_name = NodeToName()
12581252
12591253 # Keeps track of local reads
1260- cdef ClipScoper_t clip_scope = ClipScoper(minimizer_dist, k = k, m = m, clip_length = clip_l,
1254+ cdef ClipScoper clip_scope = ClipScoper(minimizer_dist, k = k, m = m, clip_length = clip_l,
12611255 minimizer_support_thresh = minimizer_support_thresh,
12621256 minimizer_breadth = minimizer_breadth, read_length = read_length)
12631257
12641258 # Infers long-range connections, outside local scope using pe information
1265- cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth)
1259+ cdef PairedEndScoper pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth)
12661260
12671261 # Counts poor quality soft-clips
12681262 bad_clip_counter = BadClipCounter(infile.header.nreferences, low_mem, temp_dir)
12691263
12701264 # The main graph for clustering variant information
1271- cdef Py_SimpleGraph G = map_set_utils.Py_SimpleGraph()
1265+ cdef Py_SimpleGraph G = Py_SimpleGraph()
1266+
1267+ # Table of expected gaps from transcripts file
1268+ cdef TranscriptData transcript_gaps = TranscriptData()
1269+ cdef vector[string] chrom_names
1270+ cdef bytes chr_b
1271+ cdef int i
1272+ if transcript_gaps_file:
1273+ transcript_gaps.readBed(transcript_gaps_file)
1274+ for i in range (infile.header.nreferences):
1275+ chr_b = infile.get_reference_name(i).encode(" ascii" )
1276+ chrom_names.push_back(chr_b)
12721277
12731278 site_adder = None
12741279 if sites:
@@ -1293,6 +1298,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
12931298 cdef int left_clip_size, right_clip_size
12941299
12951300 cdef bint hp_tag_found = False
1301+ cdef bint is_transcript_gap
12961302 cdef int n_checked_for_hp_tag = 0
12971303 if no_phase:
12981304 n_checked_for_hp_tag = 10 _001
@@ -1337,22 +1343,25 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
13371343 event_pos += length
13381344 continue
13391345
1340- if opp == 1 :
1346+ if opp == 1 : # INS
13411347 if length >= min_sv_size:
13421348 pos2 = event_pos + length
13431349 events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
13441350 added = True
1345- elif opp == 2 :
1351+ elif opp == 2 : # DEL
13461352 if length >= min_sv_size:
13471353 pos2 = event_pos + length
13481354 events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
13491355 added = True
13501356 event_pos += length
1351- elif opp == 3 :
1357+ elif opp == 3 : # SKIP
13521358 if length >= min_sv_size:
1353- pos2 = event_pos + length
1354- events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
1355- added = True
1359+ if r.rname < chrom_names.size() and transcript_gaps.any_data:
1360+ pos2 = event_pos + length
1361+ is_transcript_gap = transcript_gaps.hasRefSkipGap(chrom_names[r.rname], event_pos, pos2, 10 )
1362+ if not is_transcript_gap:
1363+ events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
1364+ added = True
13561365 event_pos += length
13571366 else :
13581367 if opp != 4 and opp != 5 :
@@ -1480,7 +1489,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts):
14801489 return None , None
14811490 elif len (parts) == 1 :
14821491 return None , {0 : parts[0 ]}
1483- cdef Py_Int2IntMap p2i = map_set_utils. Py_Int2IntMap()
1492+ cdef Py_Int2IntMap p2i = Py_Int2IntMap()
14841493 for i, p in enumerate (parts):
14851494 for node in p:
14861495 p2i.insert(node, i)
@@ -1538,7 +1547,7 @@ cpdef break_large_component(Py_SimpleGraph G, component, int min_support):
15381547 if len (parts) <= 1 :
15391548 return parts
15401549 # Make a table to count from, int-int
1541- cdef Py_Int2IntMap p2i = map_set_utils. Py_Int2IntMap()
1550+ cdef Py_Int2IntMap p2i = Py_Int2IntMap()
15421551 for i, p in enumerate (parts):
15431552 for node in p:
15441553 p2i.insert(node, i)
0 commit comments