1+ #!/usr/bin/env python
2+ # author: Jannes Spangenberg
3+ # e-mail: jannes.spangenberg@uni-jena.de
4+ # github: https://github.com/JannesSP
5+ # website: https://jannessp.github.io
6+
7+ from argparse import ArgumentDefaultsHelpFormatter , ArgumentParser , Namespace
8+ import matplotlib .pyplot as plt
9+ import read5_ont as r5
10+ from os .path import splitext , join
11+ import numpy as np
12+ from matplotlib .patches import Rectangle
13+
14+ def parse () -> Namespace :
15+ parser = ArgumentParser (
16+ formatter_class = ArgumentDefaultsHelpFormatter
17+ )
18+ parser .add_argument ("--dorado" , type = str , required = True , metavar = "TSV" , help = "Dorado segmentation output in TSV format" )
19+ parser .add_argument ("--f5cresquiggle" , type = str , required = True , metavar = "TSV" , help = "f5c resquiggle segmentation output in TSV format" )
20+ parser .add_argument ("--f5ceventalign" , type = str , required = True , metavar = "TSV" , help = "f5c eventalign segmentation output in TSV format\n Summary file must exists in the same path with extension .sum" )
21+ parser .add_argument ("--uncalled4" , type = str , metavar = "TSV" , required = True , help = "Uncalled4 segmentation output in csv format" )
22+ parser .add_argument ("--dynamont" , type = str , required = True , metavar = "CSV" , help = "Dynamont segmentation output in CSV format" )
23+ # parser.add_argument("--tombo", type=str, metavar="PATH", help="Parent directory of single fast5s processed by Tombo")
24+ parser .add_argument ("--basecalls" , type = str , required = True , metavar = "BAM" , help = "Basecalls of ONT training data as .bam file" )
25+ parser .add_argument ("--pod5" , type = str , required = True , metavar = "POD5" , help = "raw signals" )
26+ parser .add_argument ("-k" , type = int , required = True , metavar = "INT" , help = "kmer length" )
27+ parser .add_argument ("-o" , "--out" , type = str , required = True , metavar = "PATH" , help = "Output path" )
28+ parser .add_argument ("--readid" , type = str , required = True , metavar = "ID" , help = "Read ID to plot" )
29+ return parser .parse_args ()
30+
31+ def readDynamont (file : str , readid : str ) -> list :
32+ print ("Reading Dynamont output from " + file )
33+ segments = []
34+ with open (file , 'r' ) as f :
35+ next (f ) # skip header
36+ # readid,signalid,start,end,basepos,base,motif,state,posterior_probability,polish
37+ for line in f :
38+ if not line .startswith (readid ):
39+ continue
40+ try :
41+ _ , _ , start , end , _ , base , motif , * _ = line .strip ().split (',' )
42+ except ValueError : # empty line in file
43+ pass
44+ segments .append ((int (start ), int (end ), base , motif ))
45+ return segments
46+
47+ def readUncalled4 (file : str , readid : str , kmerSize : int = 5 ) -> list :
48+ print ("Reading Uncalled4 output from " + file )
49+ segments = []
50+ HALFKMERSIZE = kmerSize // 2
51+ with open (file , 'r' ) as f :
52+ next (f ) # skip header
53+ # seq.name seq.pos seq.strand aln.id seq.kmer aln.read_id dtw.start dtw.length dtw.current dtw.current_sd
54+ for line in f :
55+ _ , _ , _ , _ , kmer , rid , start , length , * _ = line .strip ().split ('\t ' )
56+ if not rid == readid :
57+ continue
58+ if start == "*" :
59+ continue
60+ segments .append ((int (start ), int (start ) + int (length ), kmer [HALFKMERSIZE ], kmer ))
61+ return segments
62+
63+ def readF5CResquiggle (file : str , readid : str , read : str , kmerSize : int = 5 ) -> list :
64+ print ("Reading f5c Resquiggle output from " + file )
65+ segments = []
66+ HALFKMERSIZE = kmerSize // 2
67+ with open (file , 'r' ) as f :
68+ next (f ) # skip header
69+ # contig position reference_kmer read_index strand event_index event_level_mean event_stdv event_length model_kmer model_mean model_stdv standardized_level start_idx end_idx
70+ for line in f :
71+ if not line .startswith (readid ):
72+ continue
73+ _ , kmer_idx , start , end = line .strip ().split ('\t ' )
74+ if start == '.' or end == '.' :
75+ continue
76+ segments .append ((int (start ), int (end ), read [int (kmer_idx ) + HALFKMERSIZE ], read [int (kmer_idx ) + kmerSize ]))
77+ return segments
78+
79+ def readF5CEventalign (file : str , summary : str , readid : str , kmerSize : int = 5 ) -> list :
80+ print ("Reading f5c Eventalign output from " + file )
81+ readNum = None
82+ HALFKMERSIZE = kmerSize // 2
83+ with open (summary , 'r' ) as f :
84+ # read_index read_name fast5_path model_name strand num_events num_steps num_skips num_stays total_duration shift scale drift var
85+ for line in f :
86+ rid , readName , * _ = line .strip ().split ('\t ' )
87+ if not readName == readid :
88+ continue
89+ readNum = rid
90+ break
91+
92+ segments = []
93+ with open (file , 'r' ) as f :
94+ next (f ) # skip header
95+ # contig position reference_kmer read_index strand event_index event_level_mean event_stdv event_length model_kmer model_mean model_stdv standardized_level start_idx end_idx
96+ for line in f :
97+ _ , _ , _ , rNum , _ , _ , _ , _ , _ , modelKmer , _ , _ , _ , start , end = line .strip ().split ('\t ' )
98+ if not rNum == readNum :
99+ continue
100+ segments .append ((int (start ), int (end ), modelKmer [HALFKMERSIZE ], modelKmer ))
101+ return segments
102+
103+ def readDorado (file : str , readid : str ) -> list :
104+ print ("Reading Dorado output from " + file )
105+ segments = []
106+ with open (file , 'r' ) as f :
107+ next (f ) # skip header
108+ # readid position base motif start end
109+ for line in f :
110+ if not line .startswith (readid ):
111+ continue
112+ rid , signalid , _ , base , motif , start , end = line .strip ().split ('\t ' )
113+ segments .append ((int (start ), int (end ), base , motif ))
114+ return segments
115+
116+ def readBasecalls (file : str , readid : str ) -> str :
117+ import pysam
118+ with pysam .AlignmentFile (file , "rb" , check_sq = False ) as bamfile :
119+ for read in bamfile .fetch (until_eof = True ):
120+ if not readid == read .query_name :
121+ continue
122+ return read .query_sequence
123+
124+ def main () -> None :
125+ basecolors = {
126+ 'A' :'#377eb8' ,
127+ 'a' :'#377eb8' ,
128+ 'C' :'#ff7f00' ,
129+ 'c' :'#ff7f00' ,
130+ 'G' :'#4daf4a' ,
131+ 'g' :'#4daf4a' ,
132+ 'T' :'#f781bf' ,
133+ 't' :'#f781bf' ,
134+ 'U' :'#f781bf' ,
135+ 'u' :'#f781bf' ,
136+ 'N' :'#ffffff' ,
137+ 'n' :'#ffffff' ,
138+ }
139+
140+ args = parse ()
141+
142+ read = readBasecalls (args .basecalls , args .readid )
143+
144+ tools = {
145+ "Dynamont" : np .array (readDynamont (args .dynamont , args .readid )),
146+ "Uncalled4" : np .array (readUncalled4 (args .uncalled4 , args .readid )),
147+ "f5c Resquiggle" : np .array (readF5CResquiggle (args .f5cresquiggle , args .readid , read , args .k // 2 )),
148+ "f5c Eventalign" : np .array (readF5CEventalign (args .f5ceventalign , splitext (args .f5ceventalign )[0 ] + '.sum' , args .readid )),
149+ # "Tombo" : readTombo(args.tombo, args.readid),
150+ "Dorado" : np .array (readDorado (args .dorado , args .readid )),
151+ }
152+
153+ outpath = args .out
154+ signal = r5 .read (args .pod5 ).getpASignal (args .readid )
155+
156+ fig , ax = plt .subplots (nrows = 5 , figsize = (110 ,15 ), dpi = 300 )
157+ fig .suptitle ("Segmentation of the same read by different tools" )
158+ fig .supylabel ("Current (pA)" )
159+ fig .supxlabel ("Rel. Time (Sequencing Data Points)" )
160+
161+ for i , (tool , segments ) in enumerate (tools .items ()):
162+ segments = np .unique (segments , axis = 1 )
163+ # segments = segments[np.argsort(segments[:, 0].astype(int))]
164+ ax [i ].plot (signal , linewidth = 1.0 , c = 'black' )
165+ ax [i ].set_xlim ((0 , len (signal )))
166+ # ax[i].vlines(segments[:, 0], ymin=min(signal), ymax=max(signal), linewidth=0.5, linestyle='--', colors='red')
167+ ax [i ].set_title (tool )
168+ # ax[i].legend()
169+ ax [i ].set_xticks (np .arange (0 , len (signal ), 1000 ))
170+
171+ for s in segments :
172+ start = int (s [0 ])
173+ end = int (s [1 ])
174+ base = str (s [2 ])
175+ # ax[i].text(start + (end - start)/2 - 6, -3.5, base, fontdict={'size' : 6, 'color':'black'}) # rotation=90
176+ ax [i ].vlines ([start , end ], ymin = min (signal ), ymax = max (signal ), colors = basecolors [base ], linestyles = '--' , linewidth = 0.7 )
177+ ax [i ].add_patch (Rectangle ((start , min (signal )), end - start , max (signal )- min (signal ), alpha = 0.4 , edgecolor = basecolors [base ], facecolor = basecolors [base ]))
178+
179+ plt .tight_layout ()
180+ plt .savefig (join (outpath , args .readid + "_tool_segmentation.svg" ), dpi = 300 )
181+ plt .savefig (join (outpath , args .readid + "_tool_segmentation.pdf" ), dpi = 300 )
182+ plt .savefig (join (outpath , args .readid + "_tool_segmentation.png" ), dpi = 300 )
183+ plt .close ()
184+
185+ # fig, ax = plt.subplots(nrows = 5, figsize=(20,10), dpi=300)
186+ # fig.suptitle(f"Segmentation of {args.readid} by Different Tools", fontsize=20)
187+ # fig.supylabel("Current (pA)", fontsize=18)
188+ # fig.supxlabel("Rel. Time (Sequencing Data Points)", fontsize=18)
189+
190+ # for i, (tool, segments) in enumerate(tools.items()):
191+ # segments = np.unique(segments, axis=1)
192+ # # segments = segments[np.argsort(segments[:, 0].astype(int))]
193+ # ax[i].plot(signal, linewidth=1.0, c='black')
194+ # # ax[i].vlines(segments[:, 0], ymin=min(signal), ymax=max(signal), linewidth=0.5, linestyle='--', colors='red')
195+ # ax[i].set_title(tool, fontsize=18)
196+ # # ax[i].legend()
197+
198+ # for s in segments:
199+ # start = int(s[0])
200+ # end = int(s[1])
201+ # base = str(s[2])
202+ # # ax[i].text(start + (end - start)/2 - 6, -3.5, base, fontdict={'size' : 6, 'color':'black'}) # rotation=90
203+ # ax[i].vlines([start], ymin=min(signal), ymax=max(signal), colors=basecolors[base], linestyles='--', linewidth=0.7)
204+ # ax[i].add_patch(Rectangle((start, min(signal)), end - start, max(signal)-min(signal), alpha=0.4, edgecolor=basecolors[base], facecolor=basecolors[base]))
205+
206+ # ax[i].set_xticks(np.arange(0, len(signal), 500))
207+ # ax[i].set_xlim((19000, 22000))
208+
209+ # plt.tight_layout()
210+ # plt.savefig(join(outpath, args.readid + "_tool_segmentation_region.svg"), dpi=300)
211+
212+ if __name__ == '__main__' :
213+ main ()
0 commit comments