@@ -2468,3 +2468,177 @@ def test_run_report_tests_failed_triggers_fix(self, pdd_test_environment):
         assert decision.operation == 'fix', (
             f"Expected 'fix' when tests_failed > 0, got '{decision.operation}'"
         )
+
+
+class TestFalsePositiveSuccessBugRegression:
+    """
+    Regression tests for GitHub issue #210: False positive success when skip_verify=True.
+
+    Bug scenario (from core dump):
+    - User runs sync, 'example' operation runs
+    - fingerprint saved with command='example'
+    - run_report created with exit_code=0, tests_failed=0, test_hash=None (from example run, not actual tests)
+    - User adds new unit tests that would fail
+    - User runs `pdd sync --skip-verify`
+    - Sync incorrectly reports success without running tests
+
+    Root causes:
+    1. run_report.test_hash was None, causing staleness detection to fail
+    2. _is_workflow_complete() didn't check if tests were actually run when skip_verify=True
+    """
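+
+    # Illustrative sketch of the fix for root cause 2 (assumed shape, not the
+    # actual implementation in sync_determine_operation): when skip_verify=True,
+    # _is_workflow_complete() should additionally require that the fingerprint
+    # records a test-running command before declaring the workflow done. The
+    # command names mirror the ones these tests assume; the real guard may differ.
+    #
+    #     tests_ran = fingerprint.command in ('test', 'fix', 'update')
+    #     if skip_verify and not tests_ran:
+    #         return False  # tests still need to run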
+
+    def test_skip_verify_with_example_command_should_not_be_complete(self, pdd_test_environment):
+        """
+        When fingerprint.command='example' and skip_verify=True, the workflow should NOT be
+        complete because tests haven't been run yet.
+
+        This is the core bug: skip_verify=True bypassed the verify check, but the test check
+        was also effectively bypassed because the code only checked for verify completion.
+        """
+        tmp_path = pdd_test_environment
+
+        Path("src").mkdir(exist_ok=True)
+        Path("tests").mkdir(exist_ok=True)
+        Path("examples").mkdir(exist_ok=True)
+
+        # Create all files
+        prompt_path = tmp_path / "prompts" / f"{BASENAME}_{LANGUAGE}.prompt"
+        prompt_hash = create_file(prompt_path, "# Test prompt")
+
+        code_path = tmp_path / "src" / f"{BASENAME}.py"
+        code_hash = create_file(code_path, "def foo(): pass")
+
+        example_path = tmp_path / "examples" / f"{BASENAME}_example.py"
+        example_hash = create_file(example_path, "# example")
+
+        test_path = tmp_path / "tests" / f"test_{BASENAME}.py"
+        test_hash = create_file(test_path, "def test_fail(): assert False  # Would fail if run")
+
+        # Create run_report from example run (not actual tests).
+        # This mimics what happens after the 'crash' check when the example passes.
+        create_run_report_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}_run.json", {
+            "timestamp": "2025-12-18T22:00:00.000000+00:00",  # Newer than fingerprint
+            "exit_code": 0,
+            "tests_passed": 1,  # This is from the example, not actual unit tests
+            "tests_failed": 0,
+            "coverage": 0.0
+            # test_hash is None (missing) - this was part of the bug
+        })
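+
+        # Hedged sketch of root cause 1 (assumed logic; current_test_hash and
+        # mark_stale are hypothetical names, not the shipped code): staleness
+        # detection presumably compares the saved test_hash with the current
+        # test file's hash, so a None value skips the comparison and the
+        # report above is never flagged as stale:
+        #
+        #     if run_report.test_hash and run_report.test_hash != current_test_hash:
+        #         mark_stale()  # never reached when test_hash is None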
+
+        # Create fingerprint with command='example' (tests haven't been run)
+        create_fingerprint_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}.json", {
+            "pdd_version": "0.0.86",
+            "timestamp": "2025-12-18T21:59:51.000000+00:00",  # Older than run_report
+            "command": "example",  # Key: this is NOT 'test', 'fix', or 'update'
+            "prompt_hash": prompt_hash,
+            "code_hash": code_hash,
+            "example_hash": example_hash,
+            "test_hash": test_hash,
+        })
+
+        mock_paths = {
+            'prompt': prompt_path,
+            'code': code_path,
+            'example': example_path,
+            'test': test_path,
+        }
+
+        # Test _is_workflow_complete directly with skip_verify=True
+        result = _is_workflow_complete(
+            paths=mock_paths,
+            skip_tests=False,
+            skip_verify=True,  # Key: skip_verify=True but tests should still be required
+            basename=BASENAME,
+            language=LANGUAGE
+        )
+
+        assert result == False, (
+            "_is_workflow_complete() returned True with skip_verify=True and command='example'.\n"
+            "Bug: Tests haven't been run yet (fingerprint.command='example'), "
+            "but workflow was considered complete.\n"
+            "Expected: False (tests need to run)"
+        )
+
+    def test_sync_returns_test_operation_when_tests_not_run(self, pdd_test_environment):
+        """
+        When skip_verify=True but tests haven't been run, sync should return a 'test' or
+        'crash' operation, NOT 'nothing'.
+
+        This reproduces the exact scenario from GitHub issue #210.
+        """
+        tmp_path = pdd_test_environment
+
+        Path("src").mkdir(exist_ok=True)
+        Path("tests").mkdir(exist_ok=True)
+        Path("examples").mkdir(exist_ok=True)
+
+        # Create all files
+        prompt_path = tmp_path / "prompts" / f"{BASENAME}_{LANGUAGE}.prompt"
+        prompt_hash = create_file(prompt_path, "# Test prompt")
+
+        code_path = tmp_path / "src" / f"{BASENAME}.py"
+        code_hash = create_file(code_path, "def foo(): pass")
+
+        example_path = tmp_path / "examples" / f"{BASENAME}_example.py"
+        example_hash = create_file(example_path, "# example")
+
+        test_path = tmp_path / "tests" / f"test_{BASENAME}.py"
+        test_hash = create_file(test_path, "def test_fail(): assert False")
+
+        # Create run_report mimicking example success (not actual tests)
+        create_run_report_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}_run.json", {
+            "timestamp": "2025-12-18T22:00:00.000000+00:00",
+            "exit_code": 0,
+            "tests_passed": 1,
+            "tests_failed": 0,
+            "coverage": 0.0
+        })
+
+        # Fingerprint with command='example' - tests never ran
+        create_fingerprint_file(get_meta_dir() / f"{BASENAME}_{LANGUAGE}.json", {
+            "pdd_version": "0.0.86",
+            "timestamp": "2025-12-18T21:59:51.000000+00:00",
+            "command": "example",
+            "prompt_hash": prompt_hash,
+            "code_hash": code_hash,
+            "example_hash": example_hash,
+            "test_hash": test_hash,
+        })
+
+        mock_paths = {
+            'prompt': prompt_path,
+            'code': code_path,
+            'example': example_path,
+            'test': test_path,
+        }
+
+        with patch('sync_determine_operation.construct_paths') as mock_construct, \
+             patch('sync_determine_operation.get_pdd_file_paths') as mock_get_paths:
+            mock_construct.return_value = (
+                {'prompt_file': str(prompt_path)},
+                {'output': str(code_path)},
+                {'output': str(test_path)},
+                {'output': str(example_path)}
+            )
+            mock_get_paths.return_value = mock_paths
+
+            decision = sync_determine_operation(
+                basename=BASENAME,
+                language=LANGUAGE,
+                target_coverage=TARGET_COVERAGE,
+                prompts_dir="prompts",
+                skip_tests=False,
+                skip_verify=True,  # Skip verify but tests should still run
+            )
+
+        assert decision.operation != 'nothing', (
+            f"Bug reproduced: Sync returned 'nothing' with skip_verify=True and command='example'.\n"
+            f"Expected: 'crash', 'verify', or 'test' (workflow should continue)\n"
+            f"Actual: '{decision.operation}' - {decision.reason}\n"
+            f"This is GitHub issue #210: False positive success"
+        )
+
+        # Should be either 'crash' (to validate example), 'verify' (if not skipped), or 'test'
+        assert decision.operation in ['crash', 'verify', 'test'], (
+            f"Expected 'crash', 'verify', or 'test' to continue workflow, got '{decision.operation}'"
+        )
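+
+
+# Usage note, assuming a standard pytest setup (add this test module's path
+# to the command if you want to restrict collection to one file):
+#
+#     pytest -k TestFalsePositiveSuccessBugRegression -v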