1+ #!/usr/bin/env python
2+ # author: Jannes Spangenberg
3+ # e-mail: jannes.spangenberg@uni-jena.de
4+ # github: https://github.com/JannesSP
5+ # website: https://jannessp.github.io
6+
7+ from argparse import ArgumentDefaultsHelpFormatter , ArgumentParser , Namespace
8+ import pandas as pd
9+ import json
10+
def parse() -> Namespace:
    """Parse the command line arguments of the metrics merge script."""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

    # (argument name, help text) — the positional order here is the order
    # callers must supply the paths in, so keep it stable.
    positional_args = (
        ("scores", "Path to the scores txt file"),
        # read metrics after segmentation
        ("dynamont", "Path to the reads metrics json file"),
        ("uncalled4", "Path to the reads metrics json file"),
        ("f5c_eventalign", "Path to the reads metrics json file"),
        ("f5c_resquiggle", "Path to the reads metrics json file"),
        ("control", "Path to the reads metrics json file"),
        ("outfile", "Path to the output file"),
        ("time_dynamont", "Path to the tools time file"),
        ("time_uncalled4", "Path to the tools time file"),
        ("time_f5c_eventalign", "Path to the tools time file"),
        ("time_f5c_resquiggle", "Path to the tools time file"),
        ("subtools_dorado", "Path to the downstream tool metrics file"),
        ("subtools_dynamont", "Path to the downstream tool metrics file"),
        ("subtools_uncalled4", "Path to the downstream tool metrics file"),
        ("subtools_f5c_eventalign", "Path to the downstream tool metrics file"),
        ("subtools_f5c_resquiggle", "Path to the downstream tool metrics file"),
    )
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, type=str, help=arg_help)

    # The tombo inputs are optional and default to None.
    optional_args = (
        ("--tombo", "Path to the reads metrics json file"),
        ("--time_tombo", "Path to the tools time file"),
        ("--subtools_tombo", "Path to the downstream tool metrics time file"),
    )
    for arg_name, arg_help in optional_args:
        parser.add_argument(arg_name, type=str, default=None, help=arg_help)

    return parser.parse_args()
36+
def _append_rows(scores: pd.DataFrame, tools: list, metrics: list, values: list) -> pd.DataFrame:
    """Append one (Tool, Metric, Value) row per list entry and return the new frame."""
    entry = pd.DataFrame({"Tool": tools, "Metric": metrics, "Value": values})
    return pd.concat([scores, entry], ignore_index=True)


def _add_segmentation_metrics(scores: pd.DataFrame, jsons: dict) -> pd.DataFrame:
    """Load each per-tool metrics JSON and append its entries as score rows.

    The "lengths" key is skipped (it holds raw per-read data, not a scalar
    metric), and "n50" is renamed to "n50_length" to match the control table.
    """
    for name, json_path in jsons.items():
        with open(json_path, "r") as json_file:
            json_data = json.load(json_file)
        for metric, value in json_data.items():
            if metric == "lengths":
                continue
            scores = _append_rows(
                scores,
                [name],
                [metric.lower().replace('n50', 'n50_length')],
                [value],
            )
    return scores


def _add_time_metrics(scores: pd.DataFrame, times: dict) -> pd.DataFrame:
    """Parse each tool's time file and append runtime and peak-memory rows.

    NOTE(review): the fixed character offsets assume the exact two-line layout
    written by the pipeline's time wrapper (hh:mm:ss at columns 14-22 on line 1,
    "... MB" after column 13 on line 2) — confirm against the producer.
    """
    for name, time_path in times.items():
        with open(time_path, "r") as time_file:
            time = time_file.readline()[14:22]
            memory = time_file.readline().strip()[13:].split(" MB")[0]
        scores = _append_rows(
            scores,
            [name, name],
            ["Time in hh:mm:ss", "Memory in MB"],
            [time, memory],
        )
    return scores


def _add_downstream_metrics(scores: pd.DataFrame, downstream_tools: dict) -> pd.DataFrame:
    """Parse the downstream (flye assembly + SVIM) metrics files.

    Each file is expected to contain four "label: value" lines in this order:
    total assembly length, N50, mean coverage, structural variant count.
    """
    for name, downstream_path in downstream_tools.items():
        with open(downstream_path, "r") as downstream_file:
            total_assembly_length = int(downstream_file.readline().strip().split(': ')[1])
            n50 = int(downstream_file.readline().strip().split(': ')[1])
            mean_cov = float(downstream_file.readline().strip().split(': ')[1])
            struct_vars = int(downstream_file.readline().strip().split(': ')[1])
        scores = _append_rows(
            scores,
            [name] * 4,
            ["flye total length", "flye n50", "flye mean coverage", "SVIM structural variants"],
            [total_assembly_length, n50, mean_cov, struct_vars],
        )
    return scores


def _ratio_to_group_max(x: pd.Series):
    """Scale a metric group to [0, 1] by its maximum (0 if the max is not positive)."""
    peak = x.max()  # hoisted: computed once instead of twice per group
    return x / peak if peak > 0 else 0


def main() -> None:
    """Merge segmentation, runtime and downstream metrics into one scored TSV.

    Reads all input paths from the command line (see parse()), normalises the
    rows into a long-format (Tool, Metric, Value) table, scores every numeric
    metric relative to the per-metric maximum, and writes the result to
    args.outfile as a tab-separated file.
    """
    args = parse()
    scores = pd.read_csv(args.scores, sep="\t")
    scores.rename(columns={"Median Score": "Value", "Segment Quality": "Metric"}, inplace=True)

    jsons = {
        "dynamont": args.dynamont,
        "uncalled4": args.uncalled4,
        "f5c_eventalign": args.f5c_eventalign,
        "f5c_resquiggle": args.f5c_resquiggle,
    }

    times = {
        "dynamont": args.time_dynamont,
        "uncalled4": args.time_uncalled4,
        "f5c_eventalign": args.time_f5c_eventalign,
        "f5c_resquiggle": args.time_f5c_resquiggle,
    }

    downstream_tools = {
        "dorado": args.subtools_dorado,
        "dynamont": args.subtools_dynamont,
        "uncalled4": args.subtools_uncalled4,
        "f5c_eventalign": args.subtools_f5c_eventalign,
        "f5c_resquiggle": args.subtools_f5c_resquiggle,
    }

    # tombo inputs are optional — not available for rna004 data
    if args.tombo:
        jsons["tombo"] = args.tombo
    if args.time_tombo:
        times["tombo"] = args.time_tombo
    if args.subtools_tombo:
        downstream_tools["tombo"] = args.subtools_tombo

    scores = _add_segmentation_metrics(scores, jsons)
    scores = _add_time_metrics(scores, times)
    scores = _add_downstream_metrics(scores, downstream_tools)

    #! add default control values to dorado
    control = pd.read_csv(args.control, sep="\t")
    for _, row in control.iterrows():
        scores = _append_rows(
            scores,
            ["Dorado"],
            [row["Metric"].lower() + '_length'],
            [row["Value"]],
        )

    # Dorado is the unsegmented baseline: every read is present and identical,
    # none are missing, truncated or changed.
    total_reads = scores.loc[scores["Metric"] == "total", "Value"].values[0]
    scores = _append_rows(
        scores,
        ["Dorado"] * 6,
        ["total", "present", "missing", "truncated", "identical", "nt changed"],
        [total_reads, total_reads, 0, 0, total_reads, 0],
    )

    #! remove controls
    scores = scores[~scores["Tool"].isin(["Control Random", "Control Uniform"])]

    # fix names
    scores["Tool"] = scores["Tool"].replace(
        {
            "dynamont": "Dynamont",
            "Dynamont NT": "Dynamont",
            "f5c_eventalign": "f5c Eventalign",
            "f5c_resquiggle": "f5c Resquiggle",
            "uncalled4": "Uncalled4",
            "tombo": "Tombo",
            "dorado": "Dorado",
        }
    )

    # Exclude non-numeric / unwanted metrics from the Metric Score calculation.
    excluded_metrics = ["missing reads", "identical reads", "Time in hh:mm:ss", "Memory in MB"]
    # .copy() avoids chained assignment on a view of `scores` (it raised
    # SettingWithCopyWarning and breaks under pandas copy-on-write).
    numeric_scores = scores[~scores["Metric"].isin(excluded_metrics)].copy()
    numeric_scores["Value"] = pd.to_numeric(numeric_scores["Value"], errors="coerce")

    # Score each metric relative to its group maximum; rows excluded above get
    # NaN on index alignment and are filled with 0.
    scores["Metric Score"] = numeric_scores.groupby("Metric")["Value"].transform(_ratio_to_group_max)
    scores["Metric Score"] = scores["Metric Score"].fillna(0)

    # For lower-is-better metrics, invert the score.
    lower_is_better = ["Homogeneity", "missing", "truncated", "nt_changed", "min_length"]
    scores.loc[scores["Metric"].isin(lower_is_better), "Metric Score"] = 1 - scores["Metric Score"]

    # Finalize the DataFrame
    scores = scores[["Tool", "Metric", "Value", "Metric Score"]]
    scores = scores.sort_values(by=["Metric", "Tool"])
    scores.reset_index(drop=True, inplace=True)

    print("\nWriting to", args.outfile, "\n")
    scores.to_csv(args.outfile, sep="\t", index=False)
202+
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()