Skip to content

Commit dd4e234

Browse files
New version of basic benchmarks
1 parent d8bee9f commit dd4e234

19 files changed

+169
-173
lines changed

.github/workflows/benchmarks.yml

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,19 @@ jobs:
2828
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
2929
pip install -e .
3030
31-
- name: Create results directory
32-
run: mkdir -p benchmarks/results benchmarks/results-docs
33-
34-
- name: Run benchmark on 50 dataset
35-
working-directory: benchmarks
36-
run: |
37-
echo "Running benchmark on 50 dataset..."
38-
python benchmark_minflowdecomp.py \
39-
--datasets datasets/small/Mouse.PacBio_reads_50.flow_corrected.grp.gz \
40-
--min-width 1 \
41-
--max-width 100
42-
43-
- name: Generate markdown results for 50 dataset
31+
- name: Run benchmark script
4432
working-directory: benchmarks
45-
run: |
46-
python aggregate_results.py MinFlowDecomp \
47-
--format markdown \
48-
--output results-docs/benchmark_50.md \
49-
--metric mean
33+
run: bash run_Mouse.PacBio_1_perwidth.sh results-docs 120
5034

5135
- name: Commit benchmark results
5236
run: |
5337
git config --local user.email "github-actions[bot]@users.noreply.github.com"
5438
git config --local user.name "github-actions[bot]"
55-
git add benchmarks/results-docs/benchmark_50.md
39+
git add benchmarks/results-docs/Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.md
5640
git diff --staged --quiet || git commit -m "Update benchmark results [skip ci]"
5741
git push
42+
43+
- name: Trigger documentation workflow
44+
run: gh workflow run docs.yml
45+
env:
46+
GH_TOKEN: ${{ github.token }}

benchmarks/BENCHMARK_VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.0.2
1+
1.0.3

benchmarks/benchmark_minflowdecomp.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import argparse
1313
from pathlib import Path
1414
import sys
15-
import time
1615

1716
# Add parent directory to path to import benchmark_utils
1817
sys.path.insert(0, str(Path(__file__).parent))
@@ -53,7 +52,6 @@
5352
SOLVER_OPTIONS = {
5453
'threads': 1,
5554
'log_to_console': 'false',
56-
'time_limit': 300, # 5 minutes per instance
5755
}
5856

5957

@@ -89,7 +87,8 @@ def run_benchmarks(
8987
dataset_paths: list,
9088
min_width: int = None,
9189
max_width: int = None,
92-
results_dir: str = 'results'
90+
results_dir: str = 'results',
91+
time_limit: int = 300,
9392
):
9493
"""
9594
Run benchmarks on specified datasets.
@@ -104,7 +103,14 @@ def run_benchmarks(
104103
Maximum width to include
105104
results_dir : str
106105
Directory to save results
106+
time_limit : int
107+
Solver time limit per instance in seconds
107108
"""
109+
solver_options = {
110+
**SOLVER_OPTIONS,
111+
'time_limit': time_limit,
112+
}
113+
108114
# Initialize components
109115
width_filter = WidthFilter(min_width, max_width)
110116
runner = BenchmarkRunner(
@@ -157,7 +163,7 @@ def run_benchmarks(
157163
graph=graph,
158164
optimization_config_name=config_name,
159165
optimization_options=optimization_options,
160-
solver_options=SOLVER_OPTIONS
166+
solver_options=solver_options
161167
)
162168

163169
all_results.append(result)
@@ -221,6 +227,12 @@ def main():
221227
default='results',
222228
help='Directory to save results (default: results)'
223229
)
230+
parser.add_argument(
231+
'--time-limit',
232+
type=int,
233+
default=300,
234+
help='Solver time limit per instance in seconds (default: 300)'
235+
)
224236

225237
args = parser.parse_args()
226238

@@ -264,13 +276,15 @@ def main():
264276
print(f" Datasets: {len(valid_paths)} file(s)")
265277
print(f" Width range: {args.min_width}-{args.max_width}")
266278
print(f" Optimization configs: {len(OPTIMIZATION_CONFIGS)}")
279+
print(f" Time limit: {args.time_limit}s")
267280
print(f" Results directory: {args.results_dir}")
268281

269282
run_benchmarks(
270283
dataset_paths=valid_paths,
271284
min_width=args.min_width,
272285
max_width=args.max_width,
273-
results_dir=args.results_dir
286+
results_dir=args.results_dir,
287+
time_limit=args.time_limit,
274288
)
275289

276290
print("\n" + "="*80)

benchmarks/create_small_dataset.py

Lines changed: 81 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
#!/usr/bin/env python
22
"""
3-
Create a small dataset by sampling random graphs from a larger dataset.
3+
Create a small dataset by selecting graphs per width from a larger dataset.
4+
5+
For each unique width value found in the input dataset, selects a specified
6+
number of graphs (first ones encountered) and saves them to a new file.
47
58
Usage:
69
python create_small_dataset.py [--input PATH] [--graphs-per-width N] [--output PATH]
710
"""
811

9-
import random
1012
import gzip
13+
import argparse
1114
from pathlib import Path
1215
import sys
1316

@@ -16,6 +19,26 @@
1619
from benchmark_utils import DatasetLoader
1720

1821

22+
def derive_output_path(input_path: str, graphs_per_width: int) -> str:
    """Derive the output dataset path from the input path and per-width count.

    The tag ``_<graphs_per_width>_perwidth`` is inserted immediately before
    the file's extension, and the result is placed next to the input file.

    Only the trailing extension is treated as such: the last suffix, plus the
    one before it when the last suffix is a compression suffix (``.gz``,
    ``.bz2``, ``.xz``). Dots that belong to the dataset name itself are left
    alone, so ``Mouse.PacBio_reads.grp`` maps to
    ``Mouse.PacBio_reads_5_perwidth.grp`` in the same way that
    ``Mouse.PacBio_reads.grp.gz`` maps to
    ``Mouse.PacBio_reads_5_perwidth.grp.gz``.

    Parameters
    ----------
    input_path : str
        Path to the input dataset.
    graphs_per_width : int
        Number of graphs kept per width value; embedded in the file name.

    Returns
    -------
    str
        Path of the output dataset, in the same directory as ``input_path``.
    """
    compression_suffixes = ('.gz', '.bz2', '.xz')

    input_file = Path(input_path)
    suffixes = input_file.suffixes

    if not suffixes:
        # No extension at all: the tag is simply appended to the name.
        ext = ''
    elif len(suffixes) >= 2 and suffixes[-1].lower() in compression_suffixes:
        # Compressed file: keep the inner extension together with the
        # compression suffix (e.g. ".grp.gz", ".tar.gz").
        ext = ''.join(suffixes[-2:])
    else:
        # Plain file: earlier dotted components are part of the name,
        # not of the extension.
        ext = suffixes[-1]

    name = input_file.name
    # Slice by explicit length so that an empty extension keeps the full name
    # (name[:-0] would yield an empty string).
    base_name = name[:len(name) - len(ext)]
    output_name = f"{base_name}_{graphs_per_width}_perwidth{ext}"

    return str(input_file.with_name(output_name))
40+
41+
1942
def write_graph_to_lines(graph):
2043
"""
2144
Convert a graph back to the .grp format lines.
@@ -50,38 +73,44 @@ def write_graph_to_lines(graph):
5073
def create_small_dataset(
5174
input_path: str,
5275
output_path: str,
53-
num_graphs: int = 50,
54-
seed: int = 42
76+
graphs_per_width: int = 5
5577
):
5678
"""
57-
Sample random graphs from a dataset and save to a new file.
79+
Select graphs from a dataset by taking a fixed number per width value.
5880
5981
Parameters
6082
----------
6183
input_path : str
6284
Path to input dataset
6385
output_path : str
6486
Path to output dataset
65-
num_graphs : int
66-
Number of graphs to sample
67-
seed : int
68-
Random seed for reproducibility
87+
graphs_per_width : int
88+
Number of graphs to select for each unique width value (default: 5)
6989
"""
7090
print(f"Loading graphs from {input_path}...")
7191
loader = DatasetLoader(input_path)
7292
graphs = loader.load_graphs()
7393
print(f"Loaded {len(graphs)} graphs")
7494

75-
# Set random seed for reproducibility
76-
random.seed(seed)
95+
# Group graphs by width
96+
width_groups = {}
97+
for graph in graphs:
98+
width = graph.graph.get('w', 0)
99+
if width not in width_groups:
100+
width_groups[width] = []
101+
width_groups[width].append(graph)
77102

78-
# Sample graphs
79-
if num_graphs >= len(graphs):
80-
print(f"Warning: Requested {num_graphs} graphs but dataset only has {len(graphs)}")
81-
sampled_graphs = graphs
82-
else:
83-
sampled_graphs = random.sample(graphs, num_graphs)
84-
print(f"Sampled {len(sampled_graphs)} random graphs")
103+
print(f"Found {len(width_groups)} unique width values")
104+
105+
# Select first graphs_per_width graphs for each width
106+
sampled_graphs = []
107+
for width in sorted(width_groups.keys()):
108+
graphs_with_width = width_groups[width]
109+
selected = graphs_with_width[:graphs_per_width]
110+
sampled_graphs.extend(selected)
111+
print(f" Width {width}: selected {len(selected)} of {len(graphs_with_width)} graphs")
112+
113+
print(f"\nTotal selected: {len(sampled_graphs)} graphs")
85114

86115
# Create output directory if needed
87116
output_file = Path(output_path)
@@ -119,18 +148,46 @@ def create_small_dataset(
119148

120149
def main():
121150
"""Main function."""
122-
# Configuration
123-
input_path = "datasets/esa2025/Mouse.PacBio_reads.grp.gz"
124-
output_path = "datasets/small/Mouse.PacBio_reads_500.grp.gz"
125-
num_graphs = 500
126-
seed = 42
151+
parser = argparse.ArgumentParser(
152+
description="Create a smaller dataset by sampling a fixed number of graphs per width"
153+
)
154+
parser.add_argument(
155+
"--input",
156+
default="datasets/esa2025/Mouse.PacBio_reads.grp.gz",
157+
help="Path to input dataset (default: %(default)s)",
158+
)
159+
parser.add_argument(
160+
"--graphs-per-width",
161+
type=int,
162+
default=5,
163+
help="Number of graphs to select per width value (default: %(default)s)",
164+
)
165+
parser.add_argument(
166+
"--output",
167+
default=None,
168+
help=(
169+
"Path to output dataset. If omitted, it is derived from --input as "
170+
"<input>_<graphs-per-width>_perwidth with the same extension(s)."
171+
),
172+
)
173+
args = parser.parse_args()
174+
175+
if args.graphs_per_width <= 0:
176+
parser.error("--graphs-per-width must be a positive integer")
177+
178+
input_path = args.input
179+
graphs_per_width = args.graphs_per_width
180+
output_path = args.output or derive_output_path(input_path, graphs_per_width)
127181

128182
print("="*70)
129183
print("Creating small dataset")
130184
print("="*70)
185+
print(f"Input: {input_path}")
186+
print(f"Output: {output_path}")
187+
print(f"Graphs per width: {graphs_per_width}")
131188

132189
try:
133-
create_small_dataset(input_path, output_path, num_graphs, seed)
190+
create_small_dataset(input_path, output_path, graphs_per_width)
134191

135192
# Verify the output
136193
print("\nVerifying output...")
Binary file not shown.
13.5 KB
Binary file not shown.
Binary file not shown.
22.8 KB
Binary file not shown.

benchmarks/datasets/esa2025/README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,10 @@ This dataset contains the following files:
44

55
- **`Mouse.PacBio_reads.grp.gz`**: This is the same file as `Mouse.PacBio_reads.grp.gz` from the [Zenodo dataset](https://doi.org/10.5281/zenodo.13987687) by Andrey Prjibelski and was used in the paper [arXiv:2411.03871](https://doi.org/10.48550/arXiv.2411.03871).
66

7-
- **`Mouse.PacBio_reads.flow_corrected.grp.gz`**: This file was obtained from `Mouse.PacBio_reads.grp.gz` using the `MinErrorFlow` class of flowpaths. For more information, see the [Minimum Error Flow documentation](https://algbio.github.io/flowpaths/minimum-error-flow.html).
7+
- **`Mouse.PacBio_reads.flow_corrected.grp.gz`**: This file was obtained from `Mouse.PacBio_reads.grp.gz` using the `MinErrorFlow` class of flowpaths. For more information, see the [Minimum Error Flow documentation](https://algbio.github.io/flowpaths/minimum-error-flow.html).
8+
9+
- **`*_1_perwidth*` files** (for example, `Mouse.PacBio_reads_1_perwidth.grp.gz` and `Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.gz`): Subsampled datasets containing **1 graph per width value**, taking the first graph encountered for each width.
10+
11+
- **`*_5_perwidth*` files** (for example, `Mouse.PacBio_reads_5_perwidth.grp.gz` and `Mouse.PacBio_reads_5_perwidth.flow_corrected.grp.gz`): Subsampled datasets containing **5 graphs per width value**, taking the first 5 graphs encountered for each width.
12+
13+
These subsets are produced with `benchmarks/create_small_dataset.py` via the `--graphs-per-width` option.
Binary file not shown.

0 commit comments

Comments
 (0)