algbio
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
Lines changed: 8 additions & 19 deletions
diff --git a/benchmarks/BENCHMARK_VERSION b/benchmarks/BENCHMARK_VERSION
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_minflowdecomp.py b/benchmarks/benchmark_minflowdecomp.py
Lines changed: 19 additions & 5 deletions
diff --git a/benchmarks/create_small_dataset.py b/benchmarks/create_small_dataset.py
Lines changed: 81 additions & 24 deletions
diff --git a/benchmarks/datasets/esa2025/Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.gz b/benchmarks/datasets/esa2025/Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.gz
13.5 KB
diff --git a/benchmarks/datasets/esa2025/Mouse.PacBio_reads_1_perwidth.grp.gz b/benchmarks/datasets/esa2025/Mouse.PacBio_reads_1_perwidth.grp.gz
13.5 KB
diff --git a/benchmarks/datasets/esa2025/Mouse.PacBio_reads_5_perwidth.flow_corrected.grp.gz b/benchmarks/datasets/esa2025/Mouse.PacBio_reads_5_perwidth.flow_corrected.grp.gz
22.8 KB
diff --git a/benchmarks/datasets/esa2025/Mouse.PacBio_reads_5_perwidth.grp.gz b/benchmarks/datasets/esa2025/Mouse.PacBio_reads_5_perwidth.grp.gz
22.8 KB
diff --git a/benchmarks/datasets/esa2025/README.md b/benchmarks/datasets/esa2025/README.md
Lines changed: 7 additions & 1 deletion
diff --git a/benchmarks/datasets/small/Mouse.PacBio_reads_50.flow_corrected.grp.gz b/benchmarks/datasets/small/Mouse.PacBio_reads_50.flow_corrected.grp.gz
-2.58 KB
@@ -28,30 +28,19 @@ jobs:
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
         pip install -e .
     
-    - name: Create results directory
-      run: mkdir -p benchmarks/results benchmarks/results-docs
-    
-    - name: Run benchmark on 50 dataset
-      working-directory: benchmarks
-      run: |
-        echo "Running benchmark on 50 dataset..."
-        python benchmark_minflowdecomp.py \
-          --datasets datasets/small/Mouse.PacBio_reads_50.flow_corrected.grp.gz \
-          --min-width 1 \
-          --max-width 100
-    
-    - name: Generate markdown results for 50 dataset
+    - name: Run benchmark script
       working-directory: benchmarks
-      run: |
-        python aggregate_results.py MinFlowDecomp \
-          --format markdown \
-          --output results-docs/benchmark_50.md \
-          --metric mean
+      run: bash run_Mouse.PacBio_1_perwidth.sh results-docs 120
 
     - name: Commit benchmark results
       run: |
         git config --local user.email "github-actions[bot]@users.noreply.github.com"
         git config --local user.name "github-actions[bot]"
-        git add benchmarks/results-docs/benchmark_50.md
+        git add benchmarks/results-docs/Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.md
         git diff --staged --quiet || git commit -m "Update benchmark results [skip ci]"
         git push
+    
+    - name: Trigger documentation workflow
+      run: gh workflow run docs.yml
+      env:
+        GH_TOKEN: ${{ github.token }}
@@ -1 +1 @@
-1.0.2
+1.0.3
@@ -12,7 +12,6 @@
 import argparse
 from pathlib import Path
 import sys
-import time
 
 # Add parent directory to path to import benchmark_utils
 sys.path.insert(0, str(Path(__file__).parent))
@@ -53,7 +52,6 @@
 SOLVER_OPTIONS = {
     'threads': 1,
     'log_to_console': 'false',
-    'time_limit': 300,  # 5 minutes per instance
 }
 
 
@@ -89,7 +87,8 @@ def run_benchmarks(
     dataset_paths: list,
     min_width: int = None,
     max_width: int = None,
-    results_dir: str = 'results'
+    results_dir: str = 'results',
+    time_limit: int = 300,
 ):
     """
     Run benchmarks on specified datasets.
@@ -104,7 +103,14 @@ def run_benchmarks(
         Maximum width to include
     results_dir : str
         Directory to save results
+    time_limit : int
+        Solver time limit per instance in seconds
     """
+    solver_options = {
+        **SOLVER_OPTIONS,
+        'time_limit': time_limit,
+    }
+
     # Initialize components
     width_filter = WidthFilter(min_width, max_width)
     runner = BenchmarkRunner(
@@ -157,7 +163,7 @@ def run_benchmarks(
                     graph=graph,
                     optimization_config_name=config_name,
                     optimization_options=optimization_options,
-                    solver_options=SOLVER_OPTIONS
+                    solver_options=solver_options
                 )
 
                 all_results.append(result)
@@ -221,6 +227,12 @@ def main():
         default='results',
         help='Directory to save results (default: results)'
     )
+    parser.add_argument(
+        '--time-limit',
+        type=int,
+        default=300,
+        help='Solver time limit per instance in seconds (default: 300)'
+    )
 
     args = parser.parse_args()
 
@@ -264,13 +276,15 @@ def main():
     print(f"  Datasets: {len(valid_paths)} file(s)")
     print(f"  Width range: {args.min_width}-{args.max_width}")
     print(f"  Optimization configs: {len(OPTIMIZATION_CONFIGS)}")
+    print(f"  Time limit: {args.time_limit}s")
     print(f"  Results directory: {args.results_dir}")
 
     run_benchmarks(
         dataset_paths=valid_paths,
         min_width=args.min_width,
         max_width=args.max_width,
-        results_dir=args.results_dir
+        results_dir=args.results_dir,
+        time_limit=args.time_limit,
     )
 
     print("\n" + "="*80)
 
@@ -1,13 +1,16 @@
 #!/usr/bin/env python
 """
-Create a small dataset by sampling random graphs from a larger dataset.
+Create a small dataset by selecting graphs per width from a larger dataset.
+
+For each unique width value found in the input dataset, selects a specified
+number of graphs (first ones encountered) and saves them to a new file.
 
 Usage:
-    python create_small_dataset.py
+    python create_small_dataset.py [--input PATH] [--graphs-per-width N] [--output PATH]
 """
 
-import random
 import gzip
+import argparse
 from pathlib import Path
 import sys
 
@@ -16,6 +19,26 @@
 from benchmark_utils import DatasetLoader
 
 
+def derive_output_path(input_path: str, graphs_per_width: int) -> str:
+    """Derive output dataset path from input path and sampling parameter."""
+    input_file = Path(input_path)
+    suffixes = input_file.suffixes
+
+    # Preserve historical format for .grp.gz files.
+    if suffixes[-2:] == ['.grp', '.gz']:
+        base_name = input_file.name[:-len('.grp.gz')]
+        output_name = f"{base_name}_{graphs_per_width}_perwidth.grp.gz"
+    elif suffixes:
+        # Keep existing extension for other file types.
+        ext = ''.join(suffixes)
+        base_name = input_file.name[:-len(ext)]
+        output_name = f"{base_name}_{graphs_per_width}_perwidth{ext}"
+    else:
+        output_name = f"{input_file.name}_{graphs_per_width}_perwidth"
+
+    return str(input_file.with_name(output_name))
+
+
 def write_graph_to_lines(graph):
     """
     Convert a graph back to the .grp format lines.
@@ -50,38 +73,44 @@ def write_graph_to_lines(graph):
 def create_small_dataset(
     input_path: str,
     output_path: str,
-    num_graphs: int = 50,
-    seed: int = 42
+    graphs_per_width: int = 5
 ):
     """
-    Sample random graphs from a dataset and save to a new file.
+    Select graphs from a dataset by taking a fixed number per width value.
     
     Parameters
     ----------
     input_path : str
         Path to input dataset
     output_path : str
         Path to output dataset
-    num_graphs : int
-        Number of graphs to sample
-    seed : int
-        Random seed for reproducibility
+    graphs_per_width : int
+        Number of graphs to select for each unique width value (default: 5)
     """
     print(f"Loading graphs from {input_path}...")
     loader = DatasetLoader(input_path)
     graphs = loader.load_graphs()
     print(f"Loaded {len(graphs)} graphs")
 
-    # Set random seed for reproducibility
-    random.seed(seed)
+    # Group graphs by width
+    width_groups = {}
+    for graph in graphs:
+        width = graph.graph.get('w', 0)
+        if width not in width_groups:
+            width_groups[width] = []
+        width_groups[width].append(graph)
 
-    # Sample graphs
-    if num_graphs >= len(graphs):
-        print(f"Warning: Requested {num_graphs} graphs but dataset only has {len(graphs)}")
-        sampled_graphs = graphs
-    else:
-        sampled_graphs = random.sample(graphs, num_graphs)
-        print(f"Sampled {len(sampled_graphs)} random graphs")
+    print(f"Found {len(width_groups)} unique width values")
+    
+    # Select first graphs_per_width graphs for each width
+    sampled_graphs = []
+    for width in sorted(width_groups.keys()):
+        graphs_with_width = width_groups[width]
+        selected = graphs_with_width[:graphs_per_width]
+        sampled_graphs.extend(selected)
+        print(f"  Width {width}: selected {len(selected)} of {len(graphs_with_width)} graphs")
+    
+    print(f"\nTotal selected: {len(sampled_graphs)} graphs")
 
     # Create output directory if needed
     output_file = Path(output_path)
@@ -119,18 +148,46 @@ def create_small_dataset(
 
 def main():
     """Main function."""
-    # Configuration
-    input_path = "datasets/esa2025/Mouse.PacBio_reads.grp.gz"
-    output_path = "datasets/small/Mouse.PacBio_reads_500.grp.gz"
-    num_graphs = 500
-    seed = 42
+    parser = argparse.ArgumentParser(
+        description="Create a smaller dataset by sampling a fixed number of graphs per width"
+    )
+    parser.add_argument(
+        "--input",
+        default="datasets/esa2025/Mouse.PacBio_reads.grp.gz",
+        help="Path to input dataset (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--graphs-per-width",
+        type=int,
+        default=5,
+        help="Number of graphs to select per width value (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help=(
+            "Path to output dataset. If omitted, it is derived from --input as "
+            "<input>_<graphs-per-width>_perwidth with the same extension(s)."
+        ),
+    )
+    args = parser.parse_args()
+
+    if args.graphs_per_width <= 0:
+        parser.error("--graphs-per-width must be a positive integer")
+
+    input_path = args.input
+    graphs_per_width = args.graphs_per_width
+    output_path = args.output or derive_output_path(input_path, graphs_per_width)
 
     print("="*70)
     print("Creating small dataset")
     print("="*70)
+    print(f"Input: {input_path}")
+    print(f"Output: {output_path}")
+    print(f"Graphs per width: {graphs_per_width}")
 
     try:
-        create_small_dataset(input_path, output_path, num_graphs, seed)
+        create_small_dataset(input_path, output_path, graphs_per_width)
 
         # Verify the output
         print("\nVerifying output...")
 
@@ -4,4 +4,10 @@ This dataset contains the following files:
 
 - **`Mouse.PacBio_reads.grp.gz`**: This is the same file as `Mouse.PacBio_reads.grp.gz` from the [Zenodo dataset](https://doi.org/10.5281/zenodo.13987687) by Andrey Prjibelski and was used in the paper [arXiv:2411.03871](https://doi.org/10.48550/arXiv.2411.03871).
 
-- **`Mouse.PacBio_reads.flow_corrected.grp.gz`**: This file was obtained from `Mouse.PacBio_reads.grp.gz` using the `MinErrorFlow` class of flowpaths. For more information, see the [Minimum Error Flow documentation](https://algbio.github.io/flowpaths/minimum-error-flow.html).
+- **`Mouse.PacBio_reads.flow_corrected.grp.gz`**: This file was obtained from `Mouse.PacBio_reads.grp.gz` using the `MinErrorFlow` class of flowpaths. For more information, see the [Minimum Error Flow documentation](https://algbio.github.io/flowpaths/minimum-error-flow.html).
+
+- **`*_1_perwidth*` files** (for example, `Mouse.PacBio_reads_1_perwidth.grp.gz` and `Mouse.PacBio_reads_1_perwidth.flow_corrected.grp.gz`): Subsampled datasets containing **1 graph per width value**, taking the first graph encountered for each width.
+
+- **`*_5_perwidth*` files** (for example, `Mouse.PacBio_reads_5_perwidth.grp.gz` and `Mouse.PacBio_reads_5_perwidth.flow_corrected.grp.gz`): Subsampled datasets containing **5 graphs per width value**, taking the first 5 graphs encountered for each width.
+
+These subsets are produced with `benchmarks/create_small_dataset.py` via the `--graphs-per-width` option.