tests/scripts/end_to_end_test_futurehouse.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 def run_test(expected_improvement_pct: int) -> bool:
     config = TestConfig(
         file_path="src/aviary/common_tags.py",
-        expected_unit_tests=0, # todo: fix bug https://linear.app/codeflash-ai/issue/CF-921/test-discovery-does-not-work-properly-for-e2e-futurehouse-example for context
+        expected_unit_tests_count=2,
         min_improvement_x=0.05,
         coverage_expectations=[
             CoverageExpectation(
tests/scripts/end_to_end_test_topological_sort_worktree.py (1 addition, 1 deletion)
@@ -17,7 +17,7 @@ def run_test(expected_improvement_pct: int) -> bool:
                 expected_lines=[25, 26, 27, 28, 29, 30, 31],
             )
         ],
-        expected_unit_tests=1,
+        expected_unit_test_files=1, # Per-function count
     )
     cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve()
     return_var = run_codeflash_command(cwd, config, expected_improvement_pct)
tests/scripts/end_to_end_test_tracer_replay.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@ def run_test(expected_improvement_pct: int) -> bool:
     config = TestConfig(
         trace_mode=True,
         min_improvement_x=0.1,
-        expected_unit_tests=0,
+        expected_unit_tests_count=None, # Tracer creates replay tests dynamically, skip validation
         coverage_expectations=[
             CoverageExpectation(function_name="funcA", expected_coverage=100.0, expected_lines=[6, 7, 8, 9, 11, 14])
         ],
tests/scripts/end_to_end_test_utilities.py (44 additions, 7 deletions)
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import os
 import pathlib
@@ -8,6 +9,11 @@
 from dataclasses import dataclass, field
 from typing import Optional

+try:
+    import tomllib
+except ImportError:
+    import tomli as tomllib
+

 @dataclass
 class CoverageExpectation:
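
A note on the fallback import above: tomllib entered the standard library in Python 3.11, and tomli is the third-party backport with the same API, so aliasing it lets the rest of the module call tomllib unconditionally on older interpreters. A minimal usage sketch (the path here is illustrative); tomllib.load requires a file opened in binary mode:

try:
    import tomllib  # stdlib on Python 3.11+
except ImportError:
    import tomli as tomllib  # backport with an identical API

with open("pyproject.toml", "rb") as f:  # binary mode is required by tomllib.load
    data = tomllib.load(f)
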
@@ -21,7 +27,10 @@ class TestConfig:
     # Make file_path optional when trace_mode is True
     file_path: Optional[pathlib.Path] = None
     function_name: Optional[str] = None
-    expected_unit_tests: Optional[int] = None
+    # Global count: "Discovered X existing unit tests and Y replay tests in Z.Zs at /path"
+    expected_unit_tests_count: Optional[int] = None
+    # Per-function count: "Discovered X existing unit test files, Y replay test files, and Z concolic..."
+    expected_unit_test_files: Optional[int] = None
     min_improvement_x: float = 0.1
     trace_mode: bool = False
     coverage_expectations: list[CoverageExpectation] = field(default_factory=list)
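
With the single expected_unit_tests field split in two, each end-to-end script opts into whichever discovery message it wants validated, as the three script changes above show. A rough sketch of the three modes (the values are illustrative, not taken from any particular test):

# Validate the global invocation count reported by optimizer.py
config = TestConfig(expected_unit_tests_count=2)

# Validate the per-function test-file count reported by function_optimizer.py
config = TestConfig(expected_unit_test_files=1)

# Leave both at None (e.g. tracer replay runs) to skip test-count validation entirely
config = TestConfig(trace_mode=True)
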
@@ -129,7 +138,20 @@ def build_command(

     if config.function_name:
         base_command.extend(["--function", config.function_name])
-    base_command.extend(["--tests-root", str(test_root), "--module-root", str(cwd)])
+
+    # Check if pyproject.toml exists with codeflash config - if so, don't override it
+    pyproject_path = cwd / "pyproject.toml"
+    has_codeflash_config = False
+    if pyproject_path.exists():
+        with contextlib.suppress(Exception):
+            with open(pyproject_path, "rb") as f:
+                pyproject_data = tomllib.load(f)
+                has_codeflash_config = "tool" in pyproject_data and "codeflash" in pyproject_data["tool"]
+
+    # Only pass --tests-root and --module-root if they're not configured in pyproject.toml
+    if not has_codeflash_config:
+        base_command.extend(["--tests-root", str(test_root), "--module-root", str(cwd)])
+
     if benchmarks_root:
         base_command.extend(["--benchmark", "--benchmarks-root", str(benchmarks_root)])
     if config.use_worktree:
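
The guard above lets an e2e fixture that ships its own codeflash configuration take precedence over the CLI flags. A small sketch of what the detection reacts to; the [tool.codeflash] keys shown are assumptions about a typical config, since only the presence of the table matters to the check:

import tomllib

sample = """
[tool.codeflash]
module-root = "code_to_optimize"
tests-root = "code_to_optimize/tests"
"""

data = tomllib.loads(sample)
has_codeflash_config = "tool" in data and "codeflash" in data["tool"]
print(has_codeflash_config)  # True -> build_command omits --tests-root/--module-root
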
@@ -163,15 +185,30 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int
         logging.error(f"Performance improvement rate {improvement_x}x not above {config.min_improvement_x}x")
         return False

-    if config.expected_unit_tests is not None:
-        unit_test_match = re.search(r"Discovered (\d+) existing unit test file", stdout)
+    if config.expected_unit_tests_count is not None:
+        # Match the global test discovery message from optimizer.py which counts test invocations
+        # Format: "Discovered X existing unit tests and Y replay tests in Z.Zs at /path/to/tests"
+        unit_test_match = re.search(r"Discovered (\d+) existing unit tests? and \d+ replay tests? in [\d.]+s at", stdout)
         if not unit_test_match:
-            logging.error("Could not find unit test count")
+            logging.error("Could not find global unit test count")
             return False

         num_tests = int(unit_test_match.group(1))
-        if num_tests != config.expected_unit_tests:
-            logging.error(f"Expected {config.expected_unit_tests} unit tests, found {num_tests}")
+        if num_tests != config.expected_unit_tests_count:
+            logging.error(f"Expected {config.expected_unit_tests_count} global unit tests, found {num_tests}")
             return False

+    if config.expected_unit_test_files is not None:
+        # Match the per-function test discovery message from function_optimizer.py
+        # Format: "Discovered X existing unit test files, Y replay test files, and Z concolic..."
+        unit_test_files_match = re.search(r"Discovered (\d+) existing unit test files?", stdout)
+        if not unit_test_files_match:
+            logging.error("Could not find per-function unit test file count")
+            return False
+
+        num_test_files = int(unit_test_files_match.group(1))
+        if num_test_files != config.expected_unit_test_files:
+            logging.error(f"Expected {config.expected_unit_test_files} unit test files, found {num_test_files}")
+            return False
+
     if config.coverage_expectations:
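
Because the two discovery messages are easy to confuse, here is a quick check of both patterns against hypothetical log lines written in the formats documented above (the counts, timing, and path are invented for illustration):

import re

global_line = "Discovered 2 existing unit tests and 1 replay test in 0.3s at /repo/tests"
per_function_line = "Discovered 1 existing unit test files, 0 replay test files, and ..."  # tail truncated

global_match = re.search(r"Discovered (\d+) existing unit tests? and \d+ replay tests? in [\d.]+s at", global_line)
files_match = re.search(r"Discovered (\d+) existing unit test files?", per_function_line)
print(global_match.group(1), files_match.group(1))  # -> 2 1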