Skip to content

Commit c7d113c

Browse files
authored
Merge pull request #211 from jiaminc-cmu/fix-pdd-sync-false-positive-bug
Resolves #210: False positive success in pdd sync with --skip-verify
2 parents 2aca3d4 + 0cd1d7c commit c7d113c

File tree

3 files changed

+189
-2
lines changed

3 files changed

+189
-2
lines changed

pdd/sync_determine_operation.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,14 @@ def _is_workflow_complete(paths: Dict[str, Path], skip_tests: bool = False, skip
733733
if fingerprint and fingerprint.command not in ['verify', 'test', 'fix', 'update']:
734734
return False
735735

736+
# CRITICAL FIX: Check tests have been run (unless skip_tests)
737+
# Without this, workflow would be "complete" after verify even though tests haven't run
738+
# This prevents false positive success when skip_verify=True but tests are still required
739+
if not skip_tests:
740+
fp = read_fingerprint(basename, language)
741+
if fp and fp.command not in ['test', 'fix', 'update']:
742+
return False
743+
736744
return True
737745

738746

pdd/sync_orchestration.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -969,12 +969,15 @@ def __init__(self, rc, out, err):
969969
else:
970970
# No crash - save run report with exit_code=0 so sync_determine_operation
971971
# knows the example was tested and passed (prevents infinite loop)
972+
# Include test_hash for staleness detection
973+
test_hash = calculate_sha256(pdd_files['test']) if pdd_files['test'].exists() else None
972974
report = RunReport(
973975
datetime.datetime.now(datetime.timezone.utc).isoformat(),
974976
exit_code=0,
975977
tests_passed=1,
976978
tests_failed=0,
977-
coverage=0.0
979+
coverage=0.0,
980+
test_hash=test_hash
978981
)
979982
save_run_report(asdict(report), basename, language)
980983
skipped_operations.append('crash')
@@ -1107,7 +1110,9 @@ def __init__(self, rc, out, err):
11071110
cwd=str(pdd_files['example'].parent),
11081111
timeout=60
11091112
)
1110-
report = RunReport(datetime.datetime.now(datetime.timezone.utc).isoformat(), returncode, 1 if returncode==0 else 0, 0 if returncode==0 else 1, 100.0 if returncode==0 else 0.0)
1113+
# Include test_hash for staleness detection
1114+
test_hash = calculate_sha256(pdd_files['test']) if pdd_files['test'].exists() else None
1115+
report = RunReport(datetime.datetime.now(datetime.timezone.utc).isoformat(), returncode, 1 if returncode==0 else 0, 0 if returncode==0 else 1, 100.0 if returncode==0 else 0.0, test_hash=test_hash)
11111116
save_run_report(asdict(report), basename, language)
11121117
except:
11131118
pass

tests/test_sync_determine_operation.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2468,3 +2468,177 @@ def test_run_report_tests_failed_triggers_fix(self, pdd_test_environment):
24682468
assert decision.operation == 'fix', (
24692469
f"Expected 'fix' when tests_failed > 0, got '{decision.operation}'"
24702470
)
2471+
2472+
2473+
class TestFalsePositiveSuccessBugRegression:
2474+
"""
2475+
Regression tests for GitHub issue #210: False positive success when skip_verify=True.
2476+
2477+
Bug scenario (from core dump):
2478+
- User runs sync, 'example' operation runs
2479+
- fingerprint saved with command='example'
2480+
- run_report created with exit_code=0, tests_failed=0, test_hash=None (from example run, not actual tests)
2481+
- User adds new unit tests that would fail
2482+
- User runs `pdd sync --skip-verify`
2483+
- Sync incorrectly reports success without running tests
2484+
2485+
Root causes:
2486+
1. run_report.test_hash was None, causing staleness detection to fail
2487+
2. _is_workflow_complete() didn't check if tests were actually run when skip_verify=True
2488+
"""
2489+
2490+
def test_skip_verify_with_example_command_should_not_be_complete(self, pdd_test_environment):
2491+
"""
2492+
When fingerprint.command='example' and skip_verify=True, workflow should NOT be complete
2493+
because tests haven't been run yet.
2494+
2495+
This is the core bug: skip_verify=True bypassed the verify check, but the test check
2496+
was also effectively bypassed because the code only checked for verify completion.
2497+
"""
2498+
tmp_path = pdd_test_environment
2499+
2500+
Path("src").mkdir(exist_ok=True)
2501+
Path("tests").mkdir(exist_ok=True)
2502+
Path("examples").mkdir(exist_ok=True)
2503+
2504+
# Create all files
2505+
prompt_path = tmp_path / "prompts" / f"{BASENAME}_{LANGUAGE}.prompt"
2506+
prompt_hash = create_file(prompt_path, "# Test prompt")
2507+
2508+
code_path = tmp_path / "src" / f"{BASENAME}.py"
2509+
code_hash = create_file(code_path, "def foo(): pass")
2510+
2511+
example_path = tmp_path / "examples" / f"{BASENAME}_example.py"
2512+
example_hash = create_file(example_path, "# example")
2513+
2514+
test_path = tmp_path / "tests" / f"test_{BASENAME}.py"
2515+
test_hash = create_file(test_path, "def test_fail(): assert False # Would fail if run")
2516+
2517+
# Create run_report from example run (not actual tests)
2518+
# This mimics what happens after 'crash' check when example passes
2519+
create_run_report_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}_run.json", {
2520+
"timestamp": "2025-12-18T22:00:00.000000+00:00", # Newer than fingerprint
2521+
"exit_code": 0,
2522+
"tests_passed": 1, # This is from example, not actual unit tests
2523+
"tests_failed": 0,
2524+
"coverage": 0.0
2525+
# test_hash is None (missing) - this was part of the bug
2526+
})
2527+
2528+
# Create fingerprint with command='example' (tests haven't been run)
2529+
create_fingerprint_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}.json", {
2530+
"pdd_version": "0.0.86",
2531+
"timestamp": "2025-12-18T21:59:51.000000+00:00", # Older than run_report
2532+
"command": "example", # Key: this is NOT 'test', 'fix', or 'update'
2533+
"prompt_hash": prompt_hash,
2534+
"code_hash": code_hash,
2535+
"example_hash": example_hash,
2536+
"test_hash": test_hash,
2537+
})
2538+
2539+
mock_paths = {
2540+
'prompt': prompt_path,
2541+
'code': code_path,
2542+
'example': example_path,
2543+
'test': test_path,
2544+
}
2545+
2546+
# Test _is_workflow_complete directly with skip_verify=True
2547+
result = _is_workflow_complete(
2548+
paths=mock_paths,
2549+
skip_tests=False,
2550+
skip_verify=True, # Key: skip_verify=True but tests should still be required
2551+
basename=BASENAME,
2552+
language=LANGUAGE
2553+
)
2554+
2555+
assert result == False, (
2556+
"_is_workflow_complete() returned True with skip_verify=True and command='example'.\n"
2557+
"Bug: Tests haven't been run yet (fingerprint.command='example'), "
2558+
"but workflow was considered complete.\n"
2559+
"Expected: False (tests need to run)"
2560+
)
2561+
2562+
def test_sync_returns_test_operation_when_tests_not_run(self, pdd_test_environment):
2563+
"""
2564+
When skip_verify=True but tests haven't been run, sync should return 'test' or 'crash'
2565+
operation, NOT 'nothing'.
2566+
2567+
This reproduces the exact scenario from GitHub issue #210.
2568+
"""
2569+
tmp_path = pdd_test_environment
2570+
2571+
Path("src").mkdir(exist_ok=True)
2572+
Path("tests").mkdir(exist_ok=True)
2573+
Path("examples").mkdir(exist_ok=True)
2574+
2575+
# Create all files
2576+
prompt_path = tmp_path / "prompts" / f"{BASENAME}_{LANGUAGE}.prompt"
2577+
prompt_hash = create_file(prompt_path, "# Test prompt")
2578+
2579+
code_path = tmp_path / "src" / f"{BASENAME}.py"
2580+
code_hash = create_file(code_path, "def foo(): pass")
2581+
2582+
example_path = tmp_path / "examples" / f"{BASENAME}_example.py"
2583+
example_hash = create_file(example_path, "# example")
2584+
2585+
test_path = tmp_path / "tests" / f"test_{BASENAME}.py"
2586+
test_hash = create_file(test_path, "def test_fail(): assert False")
2587+
2588+
# Create run_report mimicking example success (not actual tests)
2589+
create_run_report_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}_run.json", {
2590+
"timestamp": "2025-12-18T22:00:00.000000+00:00",
2591+
"exit_code": 0,
2592+
"tests_passed": 1,
2593+
"tests_failed": 0,
2594+
"coverage": 0.0
2595+
})
2596+
2597+
# Fingerprint with command='example' - tests never ran
2598+
create_fingerprint_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}.json", {
2599+
"pdd_version": "0.0.86",
2600+
"timestamp": "2025-12-18T21:59:51.000000+00:00",
2601+
"command": "example",
2602+
"prompt_hash": prompt_hash,
2603+
"code_hash": code_hash,
2604+
"example_hash": example_hash,
2605+
"test_hash": test_hash,
2606+
})
2607+
2608+
mock_paths = {
2609+
'prompt': prompt_path,
2610+
'code': code_path,
2611+
'example': example_path,
2612+
'test': test_path,
2613+
}
2614+
2615+
with patch('sync_determine_operation.construct_paths') as mock_construct, \
2616+
patch('sync_determine_operation.get_pdd_file_paths') as mock_get_paths:
2617+
mock_construct.return_value = (
2618+
{'prompt_file': str(prompt_path)},
2619+
{'output': str(code_path)},
2620+
{'output': str(test_path)},
2621+
{'output': str(example_path)}
2622+
)
2623+
mock_get_paths.return_value = mock_paths
2624+
2625+
decision = sync_determine_operation(
2626+
basename=BASENAME,
2627+
language=LANGUAGE,
2628+
target_coverage=TARGET_COVERAGE,
2629+
prompts_dir="prompts",
2630+
skip_tests=False,
2631+
skip_verify=True, # Skip verify but tests should still run
2632+
)
2633+
2634+
assert decision.operation != 'nothing', (
2635+
f"Bug reproduced: Sync returned 'nothing' with skip_verify=True and command='example'.\n"
2636+
f"Expected: 'crash', 'verify', or 'test' (workflow should continue)\n"
2637+
f"Actual: '{decision.operation}' - {decision.reason}\n"
2638+
f"This is GitHub issue #210: False positive success"
2639+
)
2640+
2641+
# Should be either 'crash' (to validate example), 'verify' (if not skipped), or 'test'
2642+
assert decision.operation in ['crash', 'verify', 'test'], (
2643+
f"Expected 'crash', 'verify', or 'test' to continue workflow, got '{decision.operation}'"
2644+
)

0 commit comments

Comments
 (0)