Skip to content

Commit 8362a32

Browse files
committed
add plot for all tools combined
1 parent 6c184b4 commit 8362a32

File tree

1 file changed

+213
-0
lines changed

1 file changed

+213
-0
lines changed
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#!/usr/bin/env python
2+
# author: Jannes Spangenberg
3+
# e-mail: jannes.spangenberg@uni-jena.de
4+
# github: https://github.com/JannesSP
5+
# website: https://jannessp.github.io
6+
7+
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace
8+
import matplotlib.pyplot as plt
9+
import read5_ont as r5
10+
from os.path import splitext, join
11+
import numpy as np
12+
from matplotlib.patches import Rectangle
13+
14+
def parse() -> Namespace:
15+
parser = ArgumentParser(
16+
formatter_class=ArgumentDefaultsHelpFormatter
17+
)
18+
parser.add_argument("--dorado", type=str, required=True, metavar="TSV", help="Dorado segmentation output in TSV format")
19+
parser.add_argument("--f5cresquiggle", type=str, required=True, metavar="TSV", help="f5c resquiggle segmentation output in TSV format")
20+
parser.add_argument("--f5ceventalign", type=str, required=True, metavar="TSV", help="f5c eventalign segmentation output in TSV format\nSummary file must exists in the same path with extension .sum")
21+
parser.add_argument("--uncalled4", type=str, metavar="TSV", required=True, help="Uncalled4 segmentation output in csv format")
22+
parser.add_argument("--dynamont", type=str, required=True, metavar="CSV", help="Dynamont segmentation output in CSV format")
23+
# parser.add_argument("--tombo", type=str, metavar="PATH", help="Parent directory of single fast5s processed by Tombo")
24+
parser.add_argument("--basecalls", type=str, required=True, metavar="BAM", help="Basecalls of ONT training data as .bam file")
25+
parser.add_argument("--pod5", type=str, required=True, metavar="POD5", help="raw signals")
26+
parser.add_argument("-k", type=int, required=True, metavar="INT", help="kmer length")
27+
parser.add_argument("-o", "--out", type=str, required=True, metavar="PATH", help="Output path")
28+
parser.add_argument("--readid", type=str, required=True, metavar="ID", help="Read ID to plot")
29+
return parser.parse_args()
30+
31+
def readDynamont(file: str, readid : str) -> list:
32+
print("Reading Dynamont output from " + file)
33+
segments = []
34+
with open(file, 'r') as f:
35+
next(f) # skip header
36+
# readid,signalid,start,end,basepos,base,motif,state,posterior_probability,polish
37+
for line in f:
38+
if not line.startswith(readid):
39+
continue
40+
try:
41+
_, _, start, end, _, base, motif, *_ = line.strip().split(',')
42+
except ValueError: # empty line in file
43+
pass
44+
segments.append((int(start), int(end), base, motif))
45+
return segments
46+
47+
def readUncalled4(file : str, readid : str, kmerSize : int = 5) -> list:
48+
print("Reading Uncalled4 output from " + file)
49+
segments = []
50+
HALFKMERSIZE = kmerSize // 2
51+
with open(file, 'r') as f:
52+
next(f) # skip header
53+
# seq.name seq.pos seq.strand aln.id seq.kmer aln.read_id dtw.start dtw.length dtw.current dtw.current_sd
54+
for line in f:
55+
_, _, _, _, kmer, rid, start, length, *_ = line.strip().split('\t')
56+
if not rid == readid:
57+
continue
58+
if start == "*":
59+
continue
60+
segments.append((int(start), int(start) + int(length), kmer[HALFKMERSIZE], kmer))
61+
return segments
62+
63+
def readF5CResquiggle(file : str, readid : str, read : str, kmerSize : int = 5) -> list:
64+
print("Reading f5c Resquiggle output from " + file)
65+
segments = []
66+
HALFKMERSIZE = kmerSize // 2
67+
with open(file, 'r') as f:
68+
next(f) # skip header
69+
# contig position reference_kmer read_index strand event_index event_level_mean event_stdv event_length model_kmer model_mean model_stdv standardized_level start_idx end_idx
70+
for line in f:
71+
if not line.startswith(readid):
72+
continue
73+
_, kmer_idx, start, end = line.strip().split('\t')
74+
if start == '.' or end == '.':
75+
continue
76+
segments.append((int(start), int(end), read[int(kmer_idx) + HALFKMERSIZE], read[int(kmer_idx) + kmerSize]))
77+
return segments
78+
79+
def readF5CEventalign(file: str, summary : str, readid : str, kmerSize : int = 5) -> list:
80+
print("Reading f5c Eventalign output from " + file)
81+
readNum = None
82+
HALFKMERSIZE = kmerSize // 2
83+
with open(summary, 'r') as f:
84+
# read_index read_name fast5_path model_name strand num_events num_steps num_skips num_stays total_duration shift scale drift var
85+
for line in f:
86+
rid, readName, *_ = line.strip().split('\t')
87+
if not readName == readid:
88+
continue
89+
readNum = rid
90+
break
91+
92+
segments = []
93+
with open(file, 'r') as f:
94+
next(f) # skip header
95+
# contig position reference_kmer read_index strand event_index event_level_mean event_stdv event_length model_kmer model_mean model_stdv standardized_level start_idx end_idx
96+
for line in f:
97+
_, _, _, rNum, _, _, _, _, _, modelKmer, _, _, _, start, end = line.strip().split('\t')
98+
if not rNum == readNum:
99+
continue
100+
segments.append((int(start), int(end), modelKmer[HALFKMERSIZE], modelKmer))
101+
return segments
102+
103+
def readDorado(file : str, readid : str) -> list:
104+
print("Reading Dorado output from " + file)
105+
segments = []
106+
with open(file, 'r') as f:
107+
next(f) # skip header
108+
# readid position base motif start end
109+
for line in f:
110+
if not line.startswith(readid):
111+
continue
112+
rid, signalid, _, base, motif, start, end = line.strip().split('\t')
113+
segments.append((int(start), int(end), base, motif))
114+
return segments
115+
116+
def readBasecalls(file : str, readid : str) -> str:
117+
import pysam
118+
with pysam.AlignmentFile(file, "rb", check_sq=False) as bamfile:
119+
for read in bamfile.fetch(until_eof=True):
120+
if not readid == read.query_name:
121+
continue
122+
return read.query_sequence
123+
124+
def main() -> None:
125+
basecolors = {
126+
'A':'#377eb8',
127+
'a':'#377eb8',
128+
'C':'#ff7f00',
129+
'c':'#ff7f00',
130+
'G':'#4daf4a',
131+
'g':'#4daf4a',
132+
'T':'#f781bf',
133+
't':'#f781bf',
134+
'U':'#f781bf',
135+
'u':'#f781bf',
136+
'N':'#ffffff',
137+
'n':'#ffffff',
138+
}
139+
140+
args = parse()
141+
142+
read = readBasecalls(args.basecalls, args.readid)
143+
144+
tools = {
145+
"Dynamont" : np.array(readDynamont(args.dynamont, args.readid)),
146+
"Uncalled4" : np.array(readUncalled4(args.uncalled4, args.readid)),
147+
"f5c Resquiggle" : np.array(readF5CResquiggle(args.f5cresquiggle, args.readid, read, args.k // 2)),
148+
"f5c Eventalign" : np.array(readF5CEventalign(args.f5ceventalign, splitext(args.f5ceventalign)[0] + '.sum', args.readid)),
149+
# "Tombo" : readTombo(args.tombo, args.readid),
150+
"Dorado" : np.array(readDorado(args.dorado, args.readid)),
151+
}
152+
153+
outpath = args.out
154+
signal = r5.read(args.pod5).getpASignal(args.readid)
155+
156+
fig, ax = plt.subplots(nrows = 5, figsize=(110,15), dpi=300)
157+
fig.suptitle("Segmentation of the same read by different tools")
158+
fig.supylabel("Current (pA)")
159+
fig.supxlabel("Rel. Time (Sequencing Data Points)")
160+
161+
for i, (tool, segments) in enumerate(tools.items()):
162+
segments = np.unique(segments, axis=1)
163+
# segments = segments[np.argsort(segments[:, 0].astype(int))]
164+
ax[i].plot(signal, linewidth=1.0, c='black')
165+
ax[i].set_xlim((0, len(signal)))
166+
# ax[i].vlines(segments[:, 0], ymin=min(signal), ymax=max(signal), linewidth=0.5, linestyle='--', colors='red')
167+
ax[i].set_title(tool)
168+
# ax[i].legend()
169+
ax[i].set_xticks(np.arange(0, len(signal), 1000))
170+
171+
for s in segments:
172+
start = int(s[0])
173+
end = int(s[1])
174+
base = str(s[2])
175+
# ax[i].text(start + (end - start)/2 - 6, -3.5, base, fontdict={'size' : 6, 'color':'black'}) # rotation=90
176+
ax[i].vlines([start, end], ymin=min(signal), ymax=max(signal), colors=basecolors[base], linestyles='--', linewidth=0.7)
177+
ax[i].add_patch(Rectangle((start, min(signal)), end - start, max(signal)-min(signal), alpha=0.4, edgecolor=basecolors[base], facecolor=basecolors[base]))
178+
179+
plt.tight_layout()
180+
plt.savefig(join(outpath, args.readid + "_tool_segmentation.svg"), dpi=300)
181+
plt.savefig(join(outpath, args.readid + "_tool_segmentation.pdf"), dpi=300)
182+
plt.savefig(join(outpath, args.readid + "_tool_segmentation.png"), dpi=300)
183+
plt.close()
184+
185+
# fig, ax = plt.subplots(nrows = 5, figsize=(20,10), dpi=300)
186+
# fig.suptitle(f"Segmentation of {args.readid} by Different Tools", fontsize=20)
187+
# fig.supylabel("Current (pA)", fontsize=18)
188+
# fig.supxlabel("Rel. Time (Sequencing Data Points)", fontsize=18)
189+
190+
# for i, (tool, segments) in enumerate(tools.items()):
191+
# segments = np.unique(segments, axis=1)
192+
# # segments = segments[np.argsort(segments[:, 0].astype(int))]
193+
# ax[i].plot(signal, linewidth=1.0, c='black')
194+
# # ax[i].vlines(segments[:, 0], ymin=min(signal), ymax=max(signal), linewidth=0.5, linestyle='--', colors='red')
195+
# ax[i].set_title(tool, fontsize=18)
196+
# # ax[i].legend()
197+
198+
# for s in segments:
199+
# start = int(s[0])
200+
# end = int(s[1])
201+
# base = str(s[2])
202+
# # ax[i].text(start + (end - start)/2 - 6, -3.5, base, fontdict={'size' : 6, 'color':'black'}) # rotation=90
203+
# ax[i].vlines([start], ymin=min(signal), ymax=max(signal), colors=basecolors[base], linestyles='--', linewidth=0.7)
204+
# ax[i].add_patch(Rectangle((start, min(signal)), end - start, max(signal)-min(signal), alpha=0.4, edgecolor=basecolors[base], facecolor=basecolors[base]))
205+
206+
# ax[i].set_xticks(np.arange(0, len(signal), 500))
207+
# ax[i].set_xlim((19000, 22000))
208+
209+
# plt.tight_layout()
210+
# plt.savefig(join(outpath, args.readid + "_tool_segmentation_region.svg"), dpi=300)
211+
212+
if __name__ == '__main__':
213+
main()

0 commit comments

Comments
 (0)