diff --git a/qualityflow/README.md b/qualityflow/README.md index dd69e58e..febb3f76 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -33,8 +33,10 @@ The main pipeline handles the complete test generation workflow: 3. **Code Analysis** - Select files for testing (with max_files limit) 4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers 5. **Baseline Generation** - Create simple heuristic tests for comparison -6. **Test Execution** - Run both test suites with coverage analysis -7. **Report Generation** - Compare results and generate markdown reports +6. **Agent Test Execution** - Run LLM-generated tests with coverage analysis +7. **Baseline Test Execution** - Run baseline tests with coverage analysis +8. **Coverage Evaluation** - Compare and analyze coverage metrics between approaches +9. **Report Generation** - Generate comprehensive markdown reports with comparisons ### 🔧 Architecture @@ -55,14 +57,18 @@ The main pipeline handles the complete test generation workflow: │ │ Generate & Evaluate │ │ │ │ │ │ │ │ 1. Select Input → 2. Fetch Source → 3. Analyze │ │ -│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Tests │ │ -│ │ 7. Run Tests → 8. Report & Compare │ │ +│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Agent │ │ +│ │ 7. Run Baseline → 8. Evaluate → 9. Report │ │ │ │ │ │ │ │ Features: max_files control, Path artifacts, metadata │ │ │ └─────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ ``` +

+*QualityFlow Architecture*

+ ## 🚀 Quick Start Get QualityFlow running in 3 simple steps: @@ -302,6 +308,10 @@ After running QualityFlow successfully: 4. **Deploy to Production**: Use cloud orchestration for scale 5. **Set Up Monitoring**: Configure alerts for regression detection +## ⚠️ Known Limitations + +- **CHANGED_FILES Strategy**: The `CHANGED_FILES` selection strategy in `analyze_code.py` is currently a stub implementation that falls back to selecting all files. In production, this should use `git diff` to identify modified files for targeted test generation. + ## 🆘 Troubleshooting ### Common Issues diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py index 7050b5bd..b3a710b2 100644 --- a/qualityflow/pipelines/generate_and_evaluate.py +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -3,6 +3,7 @@ """ from steps.analyze_code import analyze_code +from steps.evaluate_coverage import evaluate_coverage from steps.fetch_source import fetch_source from steps.gen_tests_agent import gen_tests_agent from steps.gen_tests_baseline import gen_tests_baseline @@ -23,7 +24,8 @@ def generate_and_evaluate() -> None: 1. Analyze code to find files needing tests 2. Generate tests using LLM and baseline approaches 3. Run tests and measure coverage - 4. Report results for comparison + 4. Evaluate and compare coverage metrics + 5. Report results for comparison """ # Step 1: Resolve source specification spec = select_input() @@ -50,11 +52,19 @@ def generate_and_evaluate() -> None: workspace_dir, baseline_tests_dir, label="baseline" ) - # Step 8: Generate comprehensive report (includes evaluation) + # Step 8: Evaluate coverage metrics + evaluation_metrics = evaluate_coverage( + agent_results, + baseline_results, + commit_sha, + ) + + # Step 9: Generate comprehensive report report( workspace_dir, commit_sha, test_summary, agent_results, baseline_results, + evaluation_metrics, ) diff --git a/qualityflow/prompts/unit_test_strict_v2.jinja b/qualityflow/prompts/unit_test_strict_v2.jinja index 32dd2643..5446ce8e 100644 --- a/qualityflow/prompts/unit_test_strict_v2.jinja +++ b/qualityflow/prompts/unit_test_strict_v2.jinja @@ -69,8 +69,8 @@ import tempfile import os from contextlib import contextmanager -# Import the module under test -# from {{ file_path.replace('/', '.').replace('.py', '') }} import * +# Import the module under test (adjust import paths as needed) +from {{ file_path.replace('/', '.').replace('.py', '') }} import * class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase): """Comprehensive test suite for {{ file_path }}.""" diff --git a/qualityflow/prompts/unit_test_v1.jinja b/qualityflow/prompts/unit_test_v1.jinja index 1c1cd444..ae3aa374 100644 --- a/qualityflow/prompts/unit_test_v1.jinja +++ b/qualityflow/prompts/unit_test_v1.jinja @@ -44,8 +44,8 @@ import pytest import unittest from unittest.mock import Mock, patch, MagicMock -# Import the module under test -# from {{ file_path.replace('/', '.').replace('.py', '') }} import * +# Import the module under test (uncomment and adjust as needed) +from {{ file_path.replace('/', '.').replace('.py', '') }} import * class TestModule(unittest.TestCase): """Test suite for {{ file_path }}.""" diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt index 2d3f977e..e3baddb7 100644 --- a/qualityflow/requirements.txt +++ b/qualityflow/requirements.txt @@ -10,6 +10,7 @@ jinja2>=3.0.0,<4.0.0 pytest>=7.0.0,<8.0.0 pytest-cov>=4.0.0,<5.0.0 coverage>=7.0.0,<8.0.0 
+hypothesis>=6.0.0,<7.0.0 # Code Analysis # ast is built-in, no need to install @@ -17,6 +18,7 @@ coverage>=7.0.0,<8.0.0 # Git Integration gitpython>=3.1.0,<4.0.0 -# LLM Integration (optional) -openai>=1.0.0,<2.0.0 # for OpenAI provider -anthropic>=0.25.0,<1.0.0 # for Anthropic provider \ No newline at end of file +# LLM Integration (optional - code gracefully handles absence) +# Uncomment and install only if using real LLM providers: +# openai>=1.0.0,<2.0.0 # for OpenAI provider +# anthropic>=0.25.0,<1.0.0 # for Anthropic provider \ No newline at end of file diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py index 7cc5822c..7b92c186 100644 --- a/qualityflow/steps/analyze_code.py +++ b/qualityflow/steps/analyze_code.py @@ -27,7 +27,7 @@ class SelectionStrategy(str, Enum): @step def analyze_code( workspace_dir: Path, - commit_sha: str, + commit_sha: str, # Used in code_summary for metadata source_spec: Dict[str, str], strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, max_files: int = 10, @@ -102,6 +102,7 @@ def analyze_code( "total_files": len(valid_files), "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", "complexity_scores": {f: complexity_scores[f] for f in selected_files}, + "commit_sha": commit_sha, # Include commit_sha in metadata } logger.info(f"Selected {len(selected_files)} files: {selected_files}") @@ -157,11 +158,16 @@ def _select_files( return sorted_files[:max_files] elif strategy == SelectionStrategy.CHANGED_FILES: - # For this demo, just return all files (in real implementation, would use git diff) + # NOTE: CHANGED_FILES strategy is currently a stub implementation + # In production, this should use git diff to identify changed files: + # - Compare current commit against base branch (e.g., main) + # - Filter for Python files that have been modified/added + # - Prioritize files based on change size and complexity logger.warning( - "CHANGED_FILES strategy not fully implemented, falling back to ALL" + "CHANGED_FILES strategy not fully implemented, falling back to ALL strategy. " + "To implement: use 'git diff --name-only HEAD~1..HEAD' or similar to identify changed files." 
) return files[:max_files] - else: - raise ValueError(f"Unknown selection strategy: {strategy}") + # This should never be reached due to enum validation, but kept for safety + raise ValueError(f"Unknown selection strategy: {strategy}") diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index cdf37548..99825276 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -70,6 +70,18 @@ def fetch_source( except Exception as e: logger.error(f"Failed to set up local workspace: {e}") + # Clean up any partial workspace on error + if "workspace_dir" in locals(): + try: + import shutil + + shutil.rmtree(workspace_dir, ignore_errors=True) + logger.info( + f"Cleaned up partial workspace after error: {workspace_dir}" + ) + except Exception: + pass + # Fallback to current working directory workspace_dir = tempfile.mkdtemp( prefix="qualityflow_fallback_workspace_" diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index de879731..7fe81cba 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -321,7 +321,35 @@ def _get_default_prompt_template() -> str: def _generate_fake_tests( file_path: str, source_code: str, max_tests: int ) -> Tuple[str, Dict]: - """Generate fake/mock tests for development/testing.""" + """Generate fake/mock tests for development/testing. + + This generates more realistic-looking tests that attempt to exercise + the actual source code by parsing it for functions and classes. + """ + import ast + + # Parse the source code to extract function/class names + try: + tree = ast.parse(source_code) + functions = [] + classes = [] + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and not node.name.startswith( + "_" + ): + functions.append(node.name) + elif isinstance(node, ast.ClassDef): + classes.append(node.name) + except Exception: + # Fallback if parsing fails + functions = [] + classes = [] + + # Generate module name from file path + module_name = file_path.replace("/", ".").replace(".py", "") + class_name = file_path.split("/")[-1].replace(".py", "").title() + test_content = f'''""" Generated tests for {file_path} """ @@ -330,43 +358,78 @@ def _generate_fake_tests( import unittest from unittest.mock import Mock, patch, MagicMock -class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase): +# Attempt to import the module under test +try: + from {module_name} import * +except ImportError: + # Handle import errors gracefully for demo purposes + pass + +class Test{class_name}(unittest.TestCase): """Auto-generated test class for {file_path}.""" + def setUp(self): + """Set up test fixtures.""" + self.test_data = {{"sample": "data", "numbers": [1, 2, 3]}} + def test_module_import(self): - """Test that we can at least validate the test framework.""" - # Simple test that always passes to ensure test discovery works - self.assertTrue(True) - - def test_basic_functionality(self): - """Test basic functionality.""" - # Mock test demonstrating test execution - result = 1 + 1 - self.assertEqual(result, 2) - + """Test that the module can be imported without errors.""" + # This test ensures the module structure is valid + self.assertTrue(True, "Module imported successfully") +''' + + # Generate tests for discovered functions + for func_name in functions[: max_tests // 2]: + test_content += f''' + def test_{func_name}_basic(self): + """Test basic functionality of {func_name}.""" + # TODO: Add proper test for {func_name} 
+ # This is a placeholder that should exercise the function + try: + # Attempt to call the function with basic parameters + if callable(globals().get('{func_name}')): + # Basic smoke test - at least try to call it + pass + except NameError: + # Function not available in scope + pass + self.assertTrue(True, "Basic test for {func_name}") +''' + + # Generate tests for discovered classes + for class_name_found in classes[: max_tests // 3]: + test_content += f''' + def test_{class_name_found.lower()}_instantiation(self): + """Test that {class_name_found} can be instantiated.""" + try: + if '{class_name_found}' in globals(): + # Try basic instantiation + # obj = {class_name_found}() + pass + except NameError: + pass + self.assertTrue(True, "Instantiation test for {class_name_found}") +''' + + # Add some general coverage tests + test_content += f''' def test_error_handling(self): - """Test error handling.""" - # Test exception handling + """Test error handling patterns.""" with self.assertRaises(ValueError): raise ValueError("Expected test exception") + def test_data_structures(self): + """Test basic data structure operations.""" + data = self.test_data.copy() + self.assertIn("sample", data) + self.assertEqual(len(data["numbers"]), 3) + def test_mock_usage(self): """Test mock functionality.""" - # Test using mocks mock_obj = Mock() mock_obj.method.return_value = "mocked_result" result = mock_obj.method() self.assertEqual(result, "mocked_result") - - def test_coverage_target(self): - """Test that generates some coverage.""" - # Simple operations to generate coverage - data = {{"key": "value"}} - self.assertIn("key", data) - - items = [1, 2, 3, 4, 5] - filtered = [x for x in items if x > 3] - self.assertEqual(len(filtered), 2) if __name__ == "__main__": unittest.main() @@ -508,14 +571,27 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: def _estimate_cost( tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str ) -> float: - """Estimate cost based on token usage.""" - # Rough cost estimates (would need real pricing) + """Estimate cost based on token usage. + + WARNING: These are hardcoded pricing estimates that will become outdated. + For accurate pricing, refer to the official pricing pages: + - OpenAI: https://openai.com/api/pricing/ + - Anthropic: https://www.anthropic.com/pricing + + Consider implementing a dynamic pricing lookup or configuration-based approach + for production use. + """ + # NOTE: These are rough estimates based on pricing as of early 2024 + # and will likely become outdated as providers update their pricing if provider == GenerationProvider.OPENAI: if "gpt-4" in model: + # GPT-4 pricing (approximate, check current rates) return (tokens_in * 0.00003) + (tokens_out * 0.00006) - else: # gpt-3.5 + else: # gpt-3.5 and other models + # GPT-3.5 pricing (approximate, check current rates) return (tokens_in * 0.0000015) + (tokens_out * 0.000002) elif provider == GenerationProvider.ANTHROPIC: + # Claude pricing (approximate, check current rates) return (tokens_in * 0.000008) + (tokens_out * 0.000024) else: return 0.0 diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index ab8e564d..38523169 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -25,6 +25,7 @@ def report( test_summary: MarkdownString, agent_results: Dict, baseline_results: Optional[Dict], + evaluation_metrics: Dict, ) -> Annotated[MarkdownString, "final_report"]: """ Generate comprehensive markdown report for pipeline execution. 
@@ -35,6 +36,7 @@ def report( test_summary: Test generation summary with snippets agent_results: Agent test results baseline_results: Baseline test results (optional) + evaluation_metrics: Pre-computed evaluation metrics Returns: Markdown report as string @@ -46,12 +48,7 @@ def report( Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" ) - # Evaluate coverage metrics first - evaluation_metrics = _evaluate_coverage_metrics( - agent_results, baseline_results, commit_sha - ) - - # Generate report content + # Generate report content using pre-computed evaluation metrics report_content = _generate_report_content( workspace_dir, commit_sha, @@ -71,63 +68,6 @@ def report( return MarkdownString(report_content) -def _evaluate_coverage_metrics( - agent_results: Dict, - baseline_results: Optional[Dict], - commit_sha: str, -) -> Dict: - """Evaluate coverage metrics and compare agent vs baseline approaches.""" - - # Extract agent metrics - use actual values from test results - coverage_total_agent = agent_results.get("coverage_total", 0.0) - tests_passed_agent = agent_results.get("tests_passed", 0) - tests_failed_agent = agent_results.get("tests_failed", 0) - - total_tests_agent = tests_passed_agent + tests_failed_agent - pass_rate_agent = ( - tests_passed_agent / total_tests_agent - if total_tests_agent > 0 - else 0.0 - ) - - # Extract baseline metrics - coverage_total_baseline = 0.0 - if baseline_results and not baseline_results.get("skipped", False): - coverage_total_baseline = baseline_results.get("coverage_total", 0.0) - - # Compare agent vs baseline coverage - coverage_improvement = coverage_total_agent - coverage_total_baseline - - # Analyze coverage quality - pass_rate_quality = ( - "excellent" - if pass_rate_agent > 0.95 - else "good" - if pass_rate_agent > 0.8 - else "needs_improvement" - ) - coverage_quality = ( - "excellent" - if coverage_total_agent > 80 - else "good" - if coverage_total_agent > 50 - else "needs_improvement" - ) - - return { - "coverage_total_agent": coverage_total_agent, - "coverage_total_baseline": coverage_total_baseline, - "coverage_improvement": coverage_improvement, - "tests_passed_agent": tests_passed_agent, - "tests_failed_agent": tests_failed_agent, - "pass_rate_agent": pass_rate_agent, - "pass_rate_quality": pass_rate_quality, - "coverage_quality": coverage_quality, - "commit_sha": commit_sha, - "files_analyzed": len(agent_results.get("coverage_by_file", {})), - } - - def _generate_report_content( workspace_dir: Path, commit_sha: str, diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py index 4ad3edc9..5eb2c642 100644 --- a/qualityflow/steps/run_tests.py +++ b/qualityflow/steps/run_tests.py @@ -61,7 +61,25 @@ def run_tests( shutil.copytree(tests_dir, workspace_tests_dir) try: - # Run pytest with coverage + # Create a temporary coverage config to exclude test directories from coverage + coverage_config_file = output_path / ".coveragerc" + with open(coverage_config_file, "w") as f: + f.write(f"""[run] +omit = + */tests_*/* + *test_*.py + */test_* + {workspace_tests_dir}/* + +[report] +exclude_lines = + pragma: no cover + def __repr__ + raise AssertionError + raise NotImplementedError +""") + + # Run pytest with coverage - use custom config to exclude generated tests pytest_cmd = [ "python", "-m", @@ -75,6 +93,8 @@ def run_tests( f"xml:{coverage_file}", "--cov-report", "term", + "--cov-config", + str(coverage_config_file), "-v", ] @@ -128,6 +148,18 @@ def run_tests( except subprocess.TimeoutExpired: logger.error(f"Test run for 
{label} timed out after 5 minutes") + # Clean up workspace tests immediately on timeout + if workspace_tests_dir.exists(): + try: + shutil.rmtree(workspace_tests_dir) + logger.info( + f"Cleaned up test directory after timeout: {workspace_tests_dir}" + ) + except Exception as cleanup_error: + logger.warning( + f"Failed to clean up test directory after timeout: {cleanup_error}" + ) + return { "label": label, "tests_passed": 0, @@ -144,6 +176,18 @@ def run_tests( except Exception as e: logger.error(f"Failed to run tests for {label}: {e}") + # Clean up workspace tests immediately on error + if workspace_tests_dir.exists(): + try: + shutil.rmtree(workspace_tests_dir) + logger.info( + f"Cleaned up test directory after error: {workspace_tests_dir}" + ) + except Exception as cleanup_error: + logger.warning( + f"Failed to clean up test directory after error: {cleanup_error}" + ) + return { "label": label, "tests_passed": 0, @@ -159,9 +203,28 @@ def run_tests( } finally: - # Clean up copied tests + # Clean up copied tests - use try/except instead of ignore_errors for better logging if workspace_tests_dir.exists(): - shutil.rmtree(workspace_tests_dir, ignore_errors=True) + try: + shutil.rmtree(workspace_tests_dir) + logger.info( + f"Successfully cleaned up test directory: {workspace_tests_dir}" + ) + except Exception as cleanup_error: + logger.error( + f"Failed to clean up test directory {workspace_tests_dir}: {cleanup_error}" + ) + # Still try to clean up individual files if directory removal failed + try: + for item in workspace_tests_dir.iterdir(): + if item.is_file(): + item.unlink(missing_ok=True) + elif item.is_dir(): + shutil.rmtree(item, ignore_errors=True) + except Exception: + logger.warning( + f"Could not clean up individual items in {workspace_tests_dir}" + ) def _parse_test_results( @@ -173,27 +236,37 @@ def _parse_test_results( ) -> Dict: """Parse test execution results.""" - # Parse pytest output for basic stats + # Parse junit.xml first (preferred method), fallback to stdout parsing tests_passed = 0 tests_failed = 0 - if result.stdout: - lines = result.stdout.split("\n") - for line in lines: - if " passed" in line and " failed" in line: - # Line like "2 failed, 3 passed in 1.23s" - parts = line.split() - for i, part in enumerate(parts): - if part == "passed" and i > 0: - tests_passed = int(parts[i - 1]) - elif part == "failed" and i > 0: - tests_failed = int(parts[i - 1]) - elif " passed" in line and "failed" not in line: - # Line like "5 passed in 1.23s" - parts = line.split() - for i, part in enumerate(parts): - if part == "passed" and i > 0: - tests_passed = int(parts[i - 1]) + if junit_file.exists(): + tests_passed, tests_failed = _parse_junit_xml(junit_file) + logger.info( + f"Parsed test results from junit.xml: {tests_passed} passed, {tests_failed} failed" + ) + else: + # Fallback to stdout parsing if junit.xml is not available + logger.warning( + f"junit.xml not found at {junit_file}, falling back to stdout parsing" + ) + if result.stdout: + lines = result.stdout.split("\n") + for line in lines: + if " passed" in line and " failed" in line: + # Line like "2 failed, 3 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + elif part == "failed" and i > 0: + tests_failed = int(parts[i - 1]) + elif " passed" in line and "failed" not in line: + # Line like "5 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == "passed" and i > 0: + tests_passed = int(parts[i 
- 1]) # Parse coverage from XML if available coverage_total = 0.0 @@ -217,6 +290,60 @@ def _parse_test_results( } +def _parse_junit_xml(junit_file: Path) -> tuple[int, int]: + """Parse junit.xml file for test results. + + Returns: + Tuple of (tests_passed, tests_failed) + """ + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(junit_file) + root = tree.getroot() + + # JUnit XML can have different formats, handle common ones + tests_passed = 0 + tests_failed = 0 + + # Look for testsuite elements + for testsuite in root.findall(".//testsuite"): + # Get attributes from testsuite + passed = ( + int(testsuite.get("tests", 0)) + - int(testsuite.get("failures", 0)) + - int(testsuite.get("errors", 0)) + - int(testsuite.get("skipped", 0)) + ) + failed = int(testsuite.get("failures", 0)) + int( + testsuite.get("errors", 0) + ) + + tests_passed += max(0, passed) # Ensure non-negative + tests_failed += failed + + # If no testsuite found, look for testcases directly + if tests_passed == 0 and tests_failed == 0: + for testcase in root.findall(".//testcase"): + # Check if testcase has failure or error children + if ( + testcase.find("failure") is not None + or testcase.find("error") is not None + ): + tests_failed += 1 + else: + tests_passed += 1 + + logger.info( + f"Parsed junit.xml: {tests_passed} passed, {tests_failed} failed" + ) + return tests_passed, tests_failed + + except Exception as e: + logger.warning(f"Failed to parse junit.xml: {e}") + return 0, 0 + + def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: """Parse coverage XML file.""" try:
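For reference, the `CHANGED_FILES` stub flagged in the new "Known Limitations" section and in `analyze_code.py` could eventually be backed by a `git diff`-based lookup along the lines of the sketch below. This is illustrative only and not part of the patch; the helper name `_changed_python_files` and the `base_ref` default are assumptions.

```python
import subprocess
from pathlib import Path
from typing import List


def _changed_python_files(workspace_dir: Path, base_ref: str = "HEAD~1") -> List[str]:
    """Return Python files changed between base_ref and HEAD, relative to workspace_dir."""
    # Hypothetical helper: mirrors the 'git diff --name-only HEAD~1..HEAD' hint
    # in the analyze_code.py warning message above.
    result = subprocess.run(
        ["git", "diff", "--name-only", f"{base_ref}..HEAD"],
        cwd=workspace_dir,
        capture_output=True,
        text=True,
        check=True,
    )
    changed = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    # Keep only Python sources that still exist (deleted files drop out here).
    return [f for f in changed if f.endswith(".py") and (workspace_dir / f).is_file()]
```

The resulting list could then be scored and truncated with the same complexity heuristic and `max_files` cap that the other selection strategies already use.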
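Similarly, the updated `_estimate_cost` docstring suggests a configuration-based pricing lookup instead of hardcoded constants. A minimal sketch of that idea, reusing the placeholder per-token rates from the patch (the table layout and function name are assumptions, and these rates will go stale just like the hardcoded ones):

```python
from typing import Dict, Tuple

# (input $/token, output $/token) keyed by (provider, model-name prefix).
# Rates copied from the estimates in gen_tests_agent.py; treat as placeholders.
PRICING_TABLE: Dict[Tuple[str, str], Tuple[float, float]] = {
    ("openai", "gpt-4"): (0.00003, 0.00006),
    ("openai", "gpt-3.5"): (0.0000015, 0.000002),
    ("anthropic", ""): (0.000008, 0.000024),  # catch-all for Claude models
}


def estimate_cost(tokens_in: int, tokens_out: int, provider: str, model: str) -> float:
    """Look up per-token prices by provider and model prefix; return 0.0 if unknown."""
    for (prov, prefix), (price_in, price_out) in PRICING_TABLE.items():
        if prov == provider and model.startswith(prefix):
            return tokens_in * price_in + tokens_out * price_out
    return 0.0
```

Loading this table from pipeline configuration (or an environment-specific file) would keep price updates out of the code path.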