Skip to content

Commit 7935303

Browse files
authored
Merge pull request #5 from rnajena/dev
prepare 0.7.2
2 parents 2cf9525 + 3a99b57 commit 7935303

21 files changed

+1563
-199
lines changed

CMakeLists.txt

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,14 @@ include_directories(${CMAKE_BINARY_DIR})
5050

5151

5252
# Create the first executable dynamont-NT
53-
# add_library(NT_library src/cpp/utils.cpp src/cpp/NT.cpp)
54-
# add_executable(dynamont-NT src/cpp/NT_main.cpp)
55-
# target_link_libraries(dynamont-NT PRIVATE NT_library)
56-
# if(OpenMP_CXX_FOUND)
57-
# target_link_libraries(dynamont-NT PUBLIC OpenMP::OpenMP_CXX)
58-
# endif()
53+
add_library(NT_library src/cpp/utils.cpp src/cpp/NT.cpp)
54+
add_executable(dynamont-NT src/cpp/NT_main.cpp)
55+
target_link_libraries(dynamont-NT PRIVATE NT_library)
56+
if(OpenMP_CXX_FOUND)
57+
target_link_libraries(dynamont-NT PUBLIC OpenMP::OpenMP_CXX)
58+
endif()
5959
# Include header files for NT_library
60-
# target_include_directories(dynamont-NT PRIVATE include)
60+
target_include_directories(dynamont-NT PRIVATE include)
6161

6262
# Create the second executable dynamont-NTC
6363
add_library(NTC_library src/cpp/utils.cpp src/cpp/NTC.cpp)
@@ -101,7 +101,7 @@ target_include_directories(test_dynamont PRIVATE include)
101101
add_test(NAME DynamontTests COMMAND test_dynamont)
102102

103103
# Specify installation location
104-
# install(TARGETS dynamont-NT DESTINATION bin)
105104
install(TARGETS dynamont-NTC DESTINATION bin)
106105
install(TARGETS dynamont-NT-banded DESTINATION bin)
106+
install(TARGETS dynamont-NT DESTINATION bin)
107107
install(TARGETS test_dynamont DESTINATION bin)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ dynamont-train -r <path/to/pod5/dataset/> -b <basecalls.bam> --mode basic -o <ou
5959

6060
- [rna_r9](models/rna/r9.4.1/rna002_5mer.model) (tested)
6161
- [rna_rp4](models/rna/rp4/rna004_9mer.model) (tested)
62-
- dna_r9 not available yet
62+
- dna_r9 not available
6363
- [dna_r10.4.1 260 bps](models/dna/r10.4.1/dna_r10.4.1_e8.2_260bps.model) (not tested)
6464
- [dna_r10.4.1 400 bps](models/dna/r10.4.1/dna_r10.4.1_e8.2_400bps.model) (tested)
6565

build.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ cd ..
2929
mkdir -p src/bin
3030
cp $BUILD_DIR/bin/dynamont-NTC $BIN_DIR
3131
cp $BUILD_DIR/bin/dynamont-NT-banded $BIN_DIR
32-
strip "$BIN_DIR/dynamont-NTC" "$BIN_DIR/dynamont-NT-banded"
32+
cp $BUILD_DIR/bin/dynamont-NT $BIN_DIR
33+
strip "$BIN_DIR/dynamont-NTC" "$BIN_DIR/dynamont-NT-banded" "$BIN_DIR/dynamont-NT"
3334

3435
# Build Python package (wheel + sdist)
3536
echo "[INFO] Building Python package..."
36-
# python -m pip install --user --upgrade setuptools wheel build --ignore-installed -v .
37+
python -m pip install --user --upgrade setuptools wheel build --ignore-installed -v .
3738
python -m build
3839
rm -rf $BUILD_DIR
3940
# twine upload dist/*

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ test = [
4444
dynamont-resquiggle = "python.segmentation.segment:main"
4545
dynamont-train = "python.segmentation.train:main"
4646
dynamont-NTC = "python.ntc_entry:main"
47-
dynamont-NT-banded = "python.nt_entry:main"
47+
dynamont-NT-banded = "python.nt_entry_banded:main"
48+
dynamont-NT = "python.nt_entry:main"
4849

4950
[project.urls]
5051
repository = "https://github.com/rnajena/dynamont"
@@ -77,6 +78,7 @@ packages = ["src/python"]
7778
"models/rna/rp4/rna004_9mer.model" = "models/rna/rp4/rna004_9mer.model"
7879
"src/bin/dynamont-NTC" = "src/bin/dynamont-NTC"
7980
"src/bin/dynamont-NT-banded" = "src/bin/dynamont-NT-banded"
81+
"src/bin/dynamont-NT" = "src/bin/dynamont-NT"
8082

8183
[tool.hatch.build.targets.sdist]
8284
include = [
@@ -93,6 +95,7 @@ include = [
9395
"models/rna/rp4/rna004_9mer.model" = "models/rna/rp4/rna004_9mer.model"
9496
"src/bin/dynamont-NTC" = "src/bin/dynamont-NTC"
9597
"src/bin/dynamont-NT-banded" = "src/bin/dynamont-NT-banded"
98+
"src/bin/dynamont-NT" = "src/bin/dynamont-NT"
9699

97100
[tool.hatch.version]
98101
source = "vcs"

src/python/misc/collectMetrics.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
#!/usr/bin/env python
2+
# author: Jannes Spangenberg
3+
# e-mail: jannes.spangenberg@uni-jena.de
4+
# github: https://github.com/JannesSP
5+
# website: https://jannessp.github.io
6+
7+
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace
8+
import pandas as pd
9+
import json
10+
11+
def parse(argv=None) -> Namespace:
    """Parse the command line of the metric collection script.

    Positional arguments, in order: the scores table, one read-metrics json
    per segmentation tool, the control table, the output path, one time file
    per tool and one downstream-metrics file per tool (plus dorado).
    The tombo inputs are optional flags.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse instead of ``sys.argv[1:]``. ``None`` (the
        default) keeps the standard argparse behaviour of reading
        ``sys.argv``.

    Returns
    -------
    Namespace
        One string attribute per argument; the optional tombo attributes
        are ``None`` when not given.
    """
    # Tools that provide read metrics, a time file and downstream metrics.
    tools = ("dynamont", "uncalled4", "f5c_eventalign", "f5c_resquiggle")

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("scores", type=str, help="Path to the tab-separated scores txt file")

    # read metrics after segmentation
    for tool in tools:
        parser.add_argument(tool, type=str, help=f"Path to the {tool} reads metrics json file")

    # the control file is read as a tab-separated table, not as json
    parser.add_argument("control", type=str, help="Path to the tab-separated control metrics file")
    parser.add_argument("outfile", type=str, help="Path to the output file")

    for tool in tools:
        parser.add_argument(f"time_{tool}", type=str, help=f"Path to the {tool} time file")

    # dorado only contributes downstream metrics
    for tool in ("dorado",) + tools:
        parser.add_argument(f"subtools_{tool}", type=str, help=f"Path to the {tool} downstream tool metrics file")

    parser.add_argument("--tombo", type=str, default=None, help="Path to the tombo reads metrics json file")
    parser.add_argument("--time_tombo", type=str, default=None, help="Path to the tombo time file")
    parser.add_argument("--subtools_tombo", type=str, default=None, help="Path to the tombo downstream tool metrics file")
    return parser.parse_args(argv)
36+
37+
def _metric_rows(tool, metrics, values):
    """Build a Tool/Value/Metric frame with one row per (metric, value) pair of one tool."""
    return pd.DataFrame({
        "Tool": [tool] * len(metrics),
        "Value": values,
        "Metric": metrics,
    })


def main() -> None:
    """Collect all benchmark metrics into one table and write it as TSV.

    Steps:
      1. read the scores table and rename its "Median Score"/"Segment Quality"
         columns to "Value"/"Metric",
      2. append the per-tool read metrics (json), time/memory values and
         downstream metrics (flye / SVIM),
      3. append the control values and the trivially perfect read counts
         under the tool name "Dorado",
      4. drop the "Control Random"/"Control Uniform" rows and normalise the
         tool names,
      5. compute a per-metric "Metric Score" (value divided by the metric's
         maximum, inverted for selected metrics) and write the result to
         ``args.outfile``.
    """
    args = parse()
    scores = pd.read_csv(args.scores, sep="\t")
    scores.rename(columns={"Median Score": "Value", "Segment Quality" : "Metric"}, inplace=True)

    jsons = {
        "dynamont": args.dynamont,
        "uncalled4": args.uncalled4,
        "f5c_eventalign": args.f5c_eventalign,
        "f5c_resquiggle": args.f5c_resquiggle
    }

    times = {
        "dynamont": args.time_dynamont,
        "uncalled4": args.time_uncalled4,
        "f5c_eventalign": args.time_f5c_eventalign,
        "f5c_resquiggle": args.time_f5c_resquiggle
    }

    downstream_tools = {
        "dorado" : args.subtools_dorado,
        "dynamont": args.subtools_dynamont,
        "uncalled4": args.subtools_uncalled4,
        "f5c_eventalign": args.subtools_f5c_eventalign,
        "f5c_resquiggle": args.subtools_f5c_resquiggle
    }

    # NA rna004: the tombo inputs are optional (None or empty string = absent)
    if args.tombo:
        jsons["tombo"] = args.tombo

    if args.time_tombo:
        times["tombo"] = args.time_tombo

    if args.subtools_tombo:
        downstream_tools["tombo"] = args.subtools_tombo

    # NOTE: rows are appended one frame at a time on purpose; the order of the
    # concatenations determines the dtype promotion of the "Value" column and
    # therefore how the values are formatted in the output file.

    # read metrics: every json key except "lengths" becomes one row
    for name, json_path in jsons.items():
        with open(json_path, "r") as json_file:
            json_data = json.load(json_file)
        for metric, value in json_data.items():
            if metric == "lengths":
                continue
            metric_name = metric.lower().replace('n50', 'n50_length')
            scores = pd.concat([scores, _metric_rows(name, [metric_name], [value])], ignore_index=True)

    # time files: fixed-position slices of the first two lines
    for name, time_path in times.items():
        with open(time_path, "r") as time_file:
            elapsed = time_file.readline()[14:22]
            memory = time_file.readline().strip()[13:].split(" MB")[0]
        new_entry = _metric_rows(name, ["Time in hh:mm:ss", "Memory in MB"], [elapsed, memory])
        scores = pd.concat([scores, new_entry], ignore_index=True)

    # downstream metrics: four "<label>: <value>" lines in a fixed order
    for name, downstream_path in downstream_tools.items():
        with open(downstream_path, "r") as downstream_file:
            total_assembly_length = int(downstream_file.readline().strip().split(': ')[1])
            n50 = int(downstream_file.readline().strip().split(': ')[1])
            mean_cov = float(downstream_file.readline().strip().split(': ')[1])
            struct_vars = int(downstream_file.readline().strip().split(': ')[1])

        new_entry = _metric_rows(
            name,
            ["flye total length", "flye n50", "flye mean coverage", "SVIM structural variants"],
            [total_assembly_length, n50, mean_cov, struct_vars],
        )
        scores = pd.concat([scores, new_entry], ignore_index=True)

    #! add default control values to dorado
    control = pd.read_csv(args.control, sep="\t")
    for _, row in control.iterrows():
        new_entry = _metric_rows("Dorado", [row["Metric"].lower() + '_length'], [row["Value"]])
        scores = pd.concat([scores, new_entry], ignore_index=True)

    # The first "total" row collected so far supplies the read count for Dorado;
    # this raises IndexError when no "total" metric has been collected.
    total_reads = scores.loc[scores["Metric"] == "total", "Value"].values[0]
    new_entry = _metric_rows(
        "Dorado",
        ["total", "present", "missing", "truncated", "identical", "nt changed"],
        [total_reads, total_reads, 0, 0, total_reads, 0],
    )
    scores = pd.concat([scores, new_entry], ignore_index=True)

    #! remove controls (dorado is kept)
    # .copy() makes the filtered frame independent so the column assignments
    # below do not trigger pandas chained-assignment warnings.
    scores = scores[~scores["Tool"].isin(["Control Random", "Control Uniform"])].copy()

    # fix names
    scores["Tool"] = scores["Tool"].replace(
        {
            "dynamont": "Dynamont",
            "Dynamont NT": "Dynamont",
            "f5c_eventalign": "f5c Eventalign",
            "f5c_resquiggle": "f5c Resquiggle",
            "uncalled4": "Uncalled4",
            "tombo": "Tombo",
            "dorado": "Dorado",
        }
    )

    # Exclude specific metrics (e.g., "Time in hh:mm:ss") from the Metric Score calculation
    excluded_metrics = ["missing reads", "identical reads", "Time in hh:mm:ss", "Memory in MB"]
    numeric_scores = scores[~scores["Metric"].isin(excluded_metrics)].copy()
    numeric_scores["Value"] = pd.to_numeric(numeric_scores["Value"], errors="coerce")

    # Metric Score = value / max(value) within each metric, 0 if the maximum is not positive.
    # The result is aligned to `scores` by index; excluded rows become NaN.
    scores["Metric Score"] = numeric_scores.groupby("Metric")["Value"].transform(
        lambda x: x / x.max() if x.max() > 0 else 0
    )

    # Rows without a numeric score (excluded or non-numeric values) get a score of 0
    scores["Metric Score"] = scores["Metric Score"].fillna(0)

    # Invert the score for these metrics.
    # NOTE(review): the Dorado rows above use the metric name "nt changed" (with a
    # space) while this list contains "nt_changed" - confirm which spelling the
    # json metrics use.
    inverted = scores["Metric"].isin(["Homogeneity", "missing", "truncated", "nt_changed", "min_length"])
    scores.loc[inverted, "Metric Score"] = 1 - scores["Metric Score"]

    # Finalize the DataFrame
    scores = scores[["Tool", "Metric", "Value", "Metric Score"]]
    scores = scores.sort_values(by=["Metric", "Tool"])
    scores.reset_index(drop=True, inplace=True)

    print("\nWriting to", args.outfile, "\n")
    scores.to_csv(args.outfile, sep="\t", index=False)


# Run the collection only when executed as a script, not on import.
if __name__ == '__main__':
    main()

src/python/edgeDetection/compareDatasets.py renamed to src/python/misc/compareDatasets.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,32 @@
7171
"Read Lengths" : "/data/fass5/projects/js_dynamont/benchmark/comparison/rna004/ivt/comparison_w0_readLengths.csv",
7272
"Read Quality" : "/data/fass5/projects/js_dynamont/benchmark/comparison/rna004/ivt/comparison_w0_readQuality.csv",
7373
},
74+
},
75+
"DNA R10.4.1 5kHz" : {
76+
"H. Sapiens" : {
77+
"Score" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/h_sapiens/comparison_w0_score.txt",
78+
"Reads Segmentation Ratio" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/h_sapiens/comparison_w0_segmentedReadsRatio.csv",
79+
"Read Lengths" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/h_sapiens/comparison_w0_readLengths.csv",
80+
"Read Quality" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/h_sapiens/comparison_w0_readQuality.csv",
81+
},
82+
"Zymo hmw" : {
83+
"Score" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/zymo_hmw/comparison_w0_score.txt",
84+
"Reads Segmentation Ratio" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/zymo_hmw/comparison_w0_segmentedReadsRatio.csv",
85+
"Read Lengths" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/zymo_hmw/comparison_w0_readLengths.csv",
86+
"Read Quality" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/zymo_hmw/comparison_w0_readQuality.csv",
87+
},
88+
"S. Aureus" : {
89+
"Score" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/p_anserina/comparison_w0_score.txt",
90+
"Reads Segmentation Ratio" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/p_anserina/comparison_w0_segmentedReadsRatio.csv",
91+
"Read Lengths" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/p_anserina/comparison_w0_readLengths.csv",
92+
"Read Quality" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/p_anserina/comparison_w0_readQuality.csv",
93+
},
94+
"P. Anserina" : {
95+
"Score" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/s_aureus/comparison_w0_score.txt",
96+
"Reads Segmentation Ratio" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/s_aureus/comparison_w0_segmentedReadsRatio.csv",
97+
"Read Lengths" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/s_aureus/comparison_w0_readLengths.csv",
98+
"Read Quality" : "/data/fass5/projects/js_dynamont/benchmark/comparison/dna_r10.4.1_5kHz/s_aureus/comparison_w0_readQuality.csv",
99+
}
74100
}
75101
}
76102

0 commit comments

Comments
 (0)