Skip to content

Commit 48c475d

Browse files
committed
#151 fixed TypeError. More work on trascript data
1 parent be844eb commit 48c475d

File tree

11 files changed

+682
-245
lines changed

11 files changed

+682
-245
lines changed

dysgu/cluster.pyx

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ from sys import stdout
1414
import pandas as pd
1515
from dysgu import coverage, graph, call_component, consensus, io_funcs, re_map, post_call, merge_svs, extra_metrics
1616
from dysgu.map_set_utils cimport EventResult, Py_SimpleGraph
17-
from dysgu.map_set_utils import to_dict, echo
17+
from dysgu.map_set_utils import to_dict, filter_transcript_gaps, echo
1818
from dysgu import sites_utils
1919
from dysgu.io_funcs import intersecter
2020
import pickle
@@ -143,13 +143,20 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
143143
else:
144144
cov_track_path = None
145145

146+
# Look for aligned bases file in working directory
146147
if os.path.exists(os.path.join(tdir, "n_aligned_bases.txt")):
147148
n_aligned_bases_file = int(open(os.path.join(tdir, "n_aligned_bases.txt"), "r").readline().strip())
148149
assert n_aligned_bases_file > 0
149150
find_n_aligned_bases = False
150151
else:
151152
find_n_aligned_bases = True
152153

154+
# Look for gaps file in working directory
155+
if os.path.exists(os.path.join(tdir, "unique_gaps_transcripts.bed")):
156+
transcript_gaps_file = os.path.join(tdir, "unique_gaps_transcripts.bed")
157+
else:
158+
transcript_gaps_file = ""
159+
153160
genome_scanner = coverage.GenomeScanner(infile, args["mq"], args["max_cov"], args["regions"], procs,
154161
args["buffer_size"], regions_only,
155162
kind == "stdin",
@@ -241,7 +248,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
241248
position_distance_thresh=args['sd'],
242249
max_search_depth=args['search_depth'],
243250
max_divergence=max_divergence,
244-
no_phase=args['no_phase']
251+
no_phase=args['no_phase'],
252+
transcript_gaps_file=transcript_gaps_file
245253
)
246254

247255
sites_index = None
@@ -459,22 +467,19 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
459467
keeps = len([i for i in block_edge_events if i.site_info])
460468
if keeps:
461469
logging.info("Number of matching SVs from sites: {}".format(keeps))
462-
preliminaries = []
470+
463471
if args["remap"] == "True" and args["contigs"] == "True":
464472
block_edge_events = re_map.remap_soft_clips(block_edge_events, ref_genome,
465473
keep_unmapped=True if args["pl"] == "pe" else False,
466474
min_support=min_support)
467475
logging.info("Re-alignment of soft-clips done. N candidates: {}".format(len(block_edge_events)))
476+
477+
block_edge_events = filter_transcript_gaps(block_edge_events, transcript_gaps_file)
468478

469-
block_edge_events = consensus.contig_info(block_edge_events) # GC info, repetitiveness
479+
# GC info, repetitiveness
480+
consensus.contig_info(block_edge_events)
481+
consensus.order_posA_first(block_edge_events, args)
470482

471-
cdef EventResult_t d
472-
for d in block_edge_events:
473-
d.type = args["pl"]
474-
if d.posA > d.posB and d.chrA == d.chrB:
475-
t = d.posA
476-
d.posA = d.posB
477-
d.posB = t
478483

479484
# Merge across calls
480485
if args["merge_within"] == "True":

dysgu/consensus.pyx

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -914,8 +914,17 @@ cdef tuple get_rep(contig_seq):
914914
return aligned_portion, clip_rep, right_clip_start - left_clip_end
915915

916916

917-
def contig_info(events):
917+
def order_posA_first(events, args):
918+
cdef EventResult_t d
919+
for d in events:
920+
d.type = args["pl"]
921+
if d.posA > d.posB and d.chrA == d.chrB:
922+
t = d.posA
923+
d.posA = d.posB
924+
d.posB = t
925+
918926

927+
def contig_info(events):
919928
cdef EventResult_t e
920929
for i in range(len(events)):
921930
e = events[i]
@@ -966,8 +975,6 @@ def contig_info(events):
966975
e.rep_sc = round(sc_rep, 3)
967976
e.ref_bases = aligned
968977

969-
return events
970-
971978

972979
def check_contig_match(a, b, rel_diffs=False, diffs=8, ol_length=21, supress_seq=True, return_int=False):
973980
if not a or not b:

dysgu/graph.pyx

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,21 @@ import sortedcontainers
88
from cpython cimport array
99
import array
1010
import logging
11-
from dysgu.map_set_utils cimport Py_SimpleGraph
12-
from dysgu.map_set_utils cimport multimap as cpp_map
13-
from dysgu cimport map_set_utils
11+
1412
from dysgu.io_funcs import intersecter
13+
14+
from dysgu.map_set_utils cimport Py_SimpleGraph, Py_Int2IntMap
15+
from dysgu.map_set_utils cimport multimap as cpp_map
1516
from dysgu.map_set_utils cimport cigar_clip, clip_sizes, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance
1617
from dysgu.map_set_utils cimport hash as xxhasher
1718
from dysgu.map_set_utils cimport MinimizerTable
1819
from dysgu.map_set_utils cimport set as ankerl_set
1920
from dysgu.map_set_utils cimport map as ankerl_map
21+
from dysgu.map_set_utils cimport TranscriptData
22+
from dysgu.map_set_utils import echo # for debugging
2023

2124
from dysgu.extra_metrics import BadClipCounter
22-
from dysgu.map_set_utils import echo # for debugging
25+
2326
from libcpp.string cimport string
2427
from libcpp.deque cimport deque as cpp_deque
2528
from libcpp.vector cimport vector
@@ -32,20 +35,11 @@ from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_ciga
3235

3336
ctypedef cpp_pair[int, int] cpp_item
3437

35-
ctypedef map_set_utils.Py_IntSet Py_IntSet
36-
ctypedef map_set_utils.Py_Int2IntMap Py_Int2IntMap
37-
3838
ctypedef cpp_map[int, cpp_item] ScopeItem_t
3939
ctypedef vector[ScopeItem_t] ForwardScope_t
4040

4141
ctypedef cpp_pair[int, cpp_item] event_item
4242
ctypedef cpp_pair[long, int] cpp_long_pair
43-
ctypedef long int long_int
44-
45-
ctypedef PairedEndScoper PairedEndScoper_t
46-
ctypedef TemplateEdges TemplateEdges_t
47-
ctypedef NodeToName NodeToName_t
48-
ctypedef ClipScoper ClipScoper_t
4943

5044

5145
# 1 = soft-clipped split-read, the supplementary mapping might be discarded. whole read is a node
@@ -794,7 +788,7 @@ class AlignmentsSA:
794788
self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index))
795789

796790
#
797-
cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, pos, node_name):
791+
cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper clip_scope, chrom, pos, node_name):
798792
cdef int other_node
799793
cdef int count = 0
800794
cdef ankerl_set[int] clustered_nodes
@@ -807,10 +801,10 @@ cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, po
807801
return count
808802

809803

810-
cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_scope, TemplateEdges_t template_edges,
804+
cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper pe_scope, TemplateEdges template_edges,
811805
NodeToName node_to_name, genome_scanner,
812806
int flag, int chrom, tell, int cigar_index, int event_pos, int query_pos,
813-
int chrom2, int pos2, ClipScoper_t clip_scope, ReadEnum_t read_enum,
807+
int chrom2, int pos2, ClipScoper clip_scope, ReadEnum_t read_enum,
814808
bint p1_overlaps, bint p2_overlaps, bint mm_only, int clip_l, site_adder,
815809
int length_from_cigar, bint trust_ins_len, bint paired_end):
816810
# Adds relevant information to graph and other data structures for further processing
@@ -962,10 +956,10 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length):
962956

963957

964958
cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int loci_dist, gettid,
965-
overlap_regions, int clustering_dist, PairedEndScoper_t pe_scope,
959+
overlap_regions, int clustering_dist, PairedEndScoper pe_scope,
966960
int cigar_index, int event_pos, int paired_end, long tell, genome_scanner,
967-
TemplateEdges_t template_edges, NodeToName node_to_name,
968-
int cigar_pos2, int mapq_thresh, ClipScoper_t clip_scope,
961+
TemplateEdges template_edges, NodeToName node_to_name,
962+
int cigar_pos2, int mapq_thresh, ClipScoper clip_scope,
969963
ReadEnum_t read_enum, bad_clip_counter, bint mm_only, site_adder,
970964
int length_from_cigar, bint trust_ins_len):
971965
cdef int other_node, clip_left, clip_right
@@ -1179,7 +1173,7 @@ class SiteAdder:
11791173
else:
11801174
self.scope = sortedcontainers.sortedlist.SortedList([site], key=lambda x: x.start)
11811175
self.current_chrom = site.chrom
1182-
def add_any_sites(self, int chrom, int pos, Py_SimpleGraph G, PairedEndScoper_t pe_scope, NodeToName node_to_name, cluster_dist):
1176+
def add_any_sites(self, int chrom, int pos, Py_SimpleGraph G, PairedEndScoper pe_scope, NodeToName node_to_name, cluster_dist):
11831177
cdef int node_name, start, stop, file_index
11841178
cdef ReadEnum_t read_enum
11851179
if chrom not in self.sites_queue:
@@ -1247,28 +1241,39 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
12471241
float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False,
12481242
sites=None, bint trust_ins_len=True, bint low_mem=False, temp_dir=".",
12491243
bint find_n_aligned_bases=True, float position_distance_thresh=0.8, int max_search_depth=20,
1250-
float max_divergence=0.2, bint no_phase=False):
1244+
float max_divergence=0.2, bint no_phase=False, transcript_gaps_file=""):
12511245
logging.info("Building cluster graph")
12521246

12531247
# Edges are added between alignments from same template, after building main graph
1254-
cdef TemplateEdges_t template_edges = TemplateEdges()
1248+
cdef TemplateEdges template_edges = TemplateEdges()
12551249

12561250
# Map of nodes -> read ids
12571251
cdef NodeToName node_to_name = NodeToName()
12581252

12591253
# Keeps track of local reads
1260-
cdef ClipScoper_t clip_scope = ClipScoper(minimizer_dist, k=k, m=m, clip_length=clip_l,
1254+
cdef ClipScoper clip_scope = ClipScoper(minimizer_dist, k=k, m=m, clip_length=clip_l,
12611255
minimizer_support_thresh=minimizer_support_thresh,
12621256
minimizer_breadth=minimizer_breadth, read_length=read_length)
12631257

12641258
# Infers long-range connections, outside local scope using pe information
1265-
cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth)
1259+
cdef PairedEndScoper pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth)
12661260

12671261
# Counts poor quality soft-clips
12681262
bad_clip_counter = BadClipCounter(infile.header.nreferences, low_mem, temp_dir)
12691263

12701264
# The main graph for clustering variant information
1271-
cdef Py_SimpleGraph G = map_set_utils.Py_SimpleGraph()
1265+
cdef Py_SimpleGraph G = Py_SimpleGraph()
1266+
1267+
# Table of expected gaps from transcripts file
1268+
cdef TranscriptData transcript_gaps = TranscriptData()
1269+
cdef vector[string] chrom_names
1270+
cdef bytes chr_b
1271+
cdef int i
1272+
if transcript_gaps_file:
1273+
transcript_gaps.readBed(transcript_gaps_file)
1274+
for i in range(infile.header.nreferences):
1275+
chr_b = infile.get_reference_name(i).encode("ascii")
1276+
chrom_names.push_back(chr_b)
12721277

12731278
site_adder = None
12741279
if sites:
@@ -1293,6 +1298,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
12931298
cdef int left_clip_size, right_clip_size
12941299

12951300
cdef bint hp_tag_found = False
1301+
cdef bint is_transcript_gap
12961302
cdef int n_checked_for_hp_tag = 0
12971303
if no_phase:
12981304
n_checked_for_hp_tag = 10_001
@@ -1337,22 +1343,25 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
13371343
event_pos += length
13381344
continue
13391345

1340-
if opp == 1:
1346+
if opp == 1: # INS
13411347
if length >= min_sv_size:
13421348
pos2 = event_pos + length
13431349
events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION))
13441350
added = True
1345-
elif opp == 2:
1351+
elif opp == 2: # DEL
13461352
if length >= min_sv_size:
13471353
pos2 = event_pos + length
13481354
events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION))
13491355
added = True
13501356
event_pos += length
1351-
elif opp == 3:
1357+
elif opp == 3: # SKIP
13521358
if length >= min_sv_size:
1353-
pos2 = event_pos + length
1354-
events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
1355-
added = True
1359+
if r.rname < chrom_names.size() and transcript_gaps.any_data:
1360+
pos2 = event_pos + length
1361+
is_transcript_gap = transcript_gaps.hasRefSkipGap(chrom_names[r.rname], event_pos, pos2, 10)
1362+
if not is_transcript_gap:
1363+
events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.SKIP))
1364+
added = True
13561365
event_pos += length
13571366
else:
13581367
if opp != 4 and opp != 5:
@@ -1480,7 +1489,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts):
14801489
return None, None
14811490
elif len(parts) == 1:
14821491
return None, {0: parts[0]}
1483-
cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap()
1492+
cdef Py_Int2IntMap p2i = Py_Int2IntMap()
14841493
for i, p in enumerate(parts):
14851494
for node in p:
14861495
p2i.insert(node, i)
@@ -1538,7 +1547,7 @@ cpdef break_large_component(Py_SimpleGraph G, component, int min_support):
15381547
if len(parts) <= 1:
15391548
return parts
15401549
# Make a table to count from, int-int
1541-
cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap()
1550+
cdef Py_Int2IntMap p2i = Py_Int2IntMap()
15421551
for i, p in enumerate(parts):
15431552
for node in p:
15441553
p2i.insert(node, i)

dysgu/include/find_reads.hpp

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -229,37 +229,52 @@ int process_alignment(int& current_tid, std::string& current_chrom, std::deque<s
229229
}
230230

231231
int index_start = aln->core.pos;
232-
for (uint32_t k=0; k < n_cigar; k++) {
232+
233+
for (uint32_t k = 0; k < n_cigar; k++) {
233234
uint32_t op = bam_cigar_op(cigar[k]);
234235
uint32_t length = bam_cigar_oplen(cigar[k]);
235-
if (op == BAM_CDEL) {
236-
index_start += length;
237236

238-
} else if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
239-
int index_end = index_start + length;
240-
cov_track.add(index_start, index_end);
241-
index_start = index_end;
242-
*n_aligned_bases += length;
237+
// Process different CIGAR operations
238+
switch (op) {
239+
case BAM_CDEL:
240+
index_start += length;
241+
break;
243242

244-
} else if (op == BAM_CREF_SKIP) {
245-
if (transcript_gaps.any_data) {
243+
case BAM_CMATCH:
244+
case BAM_CEQUAL:
245+
case BAM_CDIFF: {
246246
int index_end = index_start + length;
247-
bool has_ref_skip_gap = transcript_gaps.hasRefSkipGap(current_chrom, index_start, index_end, 10);
248-
sv_read = (has_ref_skip_gap) ? false : true;
249-
if (has_ref_skip_gap) {
250-
break;
251-
}
247+
cov_track.add(index_start, index_end);
248+
index_start = index_end;
249+
*n_aligned_bases += length;
250+
break;
252251
}
253-
index_start += length;
252+
253+
case BAM_CREF_SKIP:
254+
if (transcript_gaps.hasRefSkipGap(current_chrom, index_start,
255+
index_start + length, 10)) {
256+
sv_read = false;
257+
goto loop_exit; // Break out of the entire loop
258+
} else {
259+
sv_read = true;
260+
}
261+
index_start += length;
262+
break;
254263
}
264+
265+
// Check for structural variant indicators
255266
if (!sv_read) {
256-
if ((check_clips) && (op == BAM_CSOFT_CLIP) && (length >= clip_length)) {
257-
sv_read = true;
258-
} else if ((op == BAM_CINS || op == BAM_CDEL) && (length >= min_within_size)) {
267+
bool is_large_soft_clip = (check_clips && op == BAM_CSOFT_CLIP &&
268+
length >= clip_length);
269+
bool is_large_indel = ((op == BAM_CINS || op == BAM_CDEL) &&
270+
length >= min_within_size);
271+
272+
if (is_large_soft_clip || is_large_indel) {
259273
sv_read = true;
260274
}
261275
}
262276
}
277+
loop_exit:
263278

264279
if (!sv_read) { // not an sv read template yet
265280
// Check for discordant of supplementary
@@ -281,7 +296,7 @@ int process_alignment(int& current_tid, std::string& current_chrom, std::deque<s
281296

282297
void collect_transcripts(const char* transcripts_file, const char* unique_gaps_file, Tr::TranscriptData& t_reader) {
283298
if (transcripts_file == nullptr || strlen(transcripts_file) == 0) {
284-
t_reader.done = true;
299+
t_reader.any_data = false;
285300
return;
286301
}
287302
t_reader.open(transcripts_file);

0 commit comments

Comments
 (0)