Skip to content

Commit 06aa7df

Browse files
committed
Refactor test execution and coverage reporting
1 parent 2f0dacf commit 06aa7df

File tree

10 files changed

+313
-130
lines changed

10 files changed

+313
-130
lines changed

qualityflow/README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ The main pipeline handles the complete test generation workflow:
3333
3. **Code Analysis** - Select files for testing (with max_files limit)
3434
4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers
3535
5. **Baseline Generation** - Create simple heuristic tests for comparison
36-
6. **Test Execution** - Run both test suites with coverage analysis
37-
7. **Report Generation** - Compare results and generate markdown reports
36+
6. **Agent Test Execution** - Run LLM-generated tests with coverage analysis
37+
7. **Baseline Test Execution** - Run baseline tests with coverage analysis
38+
8. **Coverage Evaluation** - Compare and analyze coverage metrics between approaches
39+
9. **Report Generation** - Generate comprehensive markdown reports with comparisons
3840

3941
### 🔧 Architecture
4042

@@ -55,14 +57,18 @@ The main pipeline handles the complete test generation workflow:
5557
│ │ Generate & Evaluate │ │
5658
│ │ │ │
5759
│ │ 1. Select Input → 2. Fetch Source → 3. Analyze │ │
58-
│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Tests │ │
59-
│ │ 7. Run Tests → 8. Report & Compare │ │
60+
│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Agent │ │
61+
│ │ 7. Run Baseline → 8. Evaluate → 9. Report │ │
6062
│ │ │ │
6163
│ │ Features: max_files control, Path artifacts, metadata │ │
6264
│ └─────────────────────────────────────────────────────────┘ │
6365
└─────────────────────────────────────────────────────────────────┘
6466
```
6567

68+
<p align="center">
69+
<img src="assets/architecture.png" alt="QualityFlow Architecture" width="900" />
70+
</p>
71+
6672
## 🚀 Quick Start
6773

6874
Get QualityFlow running in 3 simple steps:
@@ -302,6 +308,10 @@ After running QualityFlow successfully:
302308
4. **Deploy to Production**: Use cloud orchestration for scale
303309
5. **Set Up Monitoring**: Configure alerts for regression detection
304310

311+
## ⚠️ Known Limitations
312+
313+
- **CHANGED_FILES Strategy**: The `CHANGED_FILES` selection strategy in `analyze_code.py` is currently a stub implementation that falls back to selecting all files. In production, this should use `git diff` to identify modified files for targeted test generation.
314+
305315
## 🆘 Troubleshooting
306316

307317
### Common Issues

qualityflow/pipelines/generate_and_evaluate.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from steps.analyze_code import analyze_code
6+
from steps.evaluate_coverage import evaluate_coverage
67
from steps.fetch_source import fetch_source
78
from steps.gen_tests_agent import gen_tests_agent
89
from steps.gen_tests_baseline import gen_tests_baseline
@@ -23,7 +24,8 @@ def generate_and_evaluate() -> None:
2324
1. Analyze code to find files needing tests
2425
2. Generate tests using LLM and baseline approaches
2526
3. Run tests and measure coverage
26-
4. Report results for comparison
27+
4. Evaluate and compare coverage metrics
28+
5. Report results for comparison
2729
"""
2830
# Step 1: Resolve source specification
2931
spec = select_input()
@@ -50,11 +52,19 @@ def generate_and_evaluate() -> None:
5052
workspace_dir, baseline_tests_dir, label="baseline"
5153
)
5254

53-
# Step 8: Generate comprehensive report (includes evaluation)
55+
# Step 8: Evaluate coverage metrics
56+
evaluation_metrics = evaluate_coverage(
57+
agent_results,
58+
baseline_results,
59+
commit_sha,
60+
)
61+
62+
# Step 9: Generate comprehensive report
5463
report(
5564
workspace_dir,
5665
commit_sha,
5766
test_summary,
5867
agent_results,
5968
baseline_results,
69+
evaluation_metrics,
6070
)

qualityflow/prompts/unit_test_strict_v2.jinja

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ import tempfile
6969
import os
7070
from contextlib import contextmanager
7171

72-
# Import the module under test
73-
# from {{ file_path.replace('/', '.').replace('.py', '') }} import *
72+
# Import the module under test (adjust import paths as needed)
73+
from {{ file_path.replace('/', '.').replace('.py', '') }} import *
7474

7575
class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase):
7676
"""Comprehensive test suite for {{ file_path }}."""

qualityflow/prompts/unit_test_v1.jinja

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ import pytest
4444
import unittest
4545
from unittest.mock import Mock, patch, MagicMock
4646

47-
# Import the module under test
48-
# from {{ file_path.replace('/', '.').replace('.py', '') }} import *
47+
# Import the module under test (adjust the import path as needed)
48+
from {{ file_path.replace('/', '.').replace('.py', '') }} import *
4949

5050
class TestModule(unittest.TestCase):
5151
"""Test suite for {{ file_path }}."""

qualityflow/requirements.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@ jinja2>=3.0.0,<4.0.0
1010
pytest>=7.0.0,<8.0.0
1111
pytest-cov>=4.0.0,<5.0.0
1212
coverage>=7.0.0,<8.0.0
13+
hypothesis>=6.0.0,<7.0.0
1314

1415
# Code Analysis
1516
# ast is built-in, no need to install
1617

1718
# Git Integration
1819
gitpython>=3.1.0,<4.0.0
1920

20-
# LLM Integration (optional)
21-
openai>=1.0.0,<2.0.0 # for OpenAI provider
22-
anthropic>=0.25.0,<1.0.0 # for Anthropic provider
21+
# LLM Integration (optional - code gracefully handles absence)
22+
# Uncomment and install only if using real LLM providers:
23+
# openai>=1.0.0,<2.0.0 # for OpenAI provider
24+
# anthropic>=0.25.0,<1.0.0 # for Anthropic provider

qualityflow/steps/analyze_code.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class SelectionStrategy(str, Enum):
2727
@step
2828
def analyze_code(
2929
workspace_dir: Path,
30-
commit_sha: str,
30+
commit_sha: str, # Used in code_summary for metadata
3131
source_spec: Dict[str, str],
3232
strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE,
3333
max_files: int = 10,
@@ -102,6 +102,7 @@ def analyze_code(
102102
"total_files": len(valid_files),
103103
"selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy",
104104
"complexity_scores": {f: complexity_scores[f] for f in selected_files},
105+
"commit_sha": commit_sha, # Include commit_sha in metadata
105106
}
106107

107108
logger.info(f"Selected {len(selected_files)} files: {selected_files}")
@@ -157,11 +158,16 @@ def _select_files(
157158
return sorted_files[:max_files]
158159

159160
elif strategy == SelectionStrategy.CHANGED_FILES:
160-
# For this demo, just return all files (in real implementation, would use git diff)
161+
# NOTE: CHANGED_FILES strategy is currently a stub implementation
162+
# In production, this should use git diff to identify changed files:
163+
# - Compare current commit against base branch (e.g., main)
164+
# - Filter for Python files that have been modified/added
165+
# - Prioritize files based on change size and complexity
161166
logger.warning(
162-
"CHANGED_FILES strategy not fully implemented, falling back to ALL"
167+
"CHANGED_FILES strategy not fully implemented, falling back to ALL strategy. "
168+
"To implement: use 'git diff --name-only HEAD~1..HEAD' or similar to identify changed files."
163169
)
164170
return files[:max_files]
165171

166-
else:
167-
raise ValueError(f"Unknown selection strategy: {strategy}")
172+
# This should never be reached due to enum validation, but kept for safety
173+
raise ValueError(f"Unknown selection strategy: {strategy}")

qualityflow/steps/fetch_source.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@ def fetch_source(
7070

7171
except Exception as e:
7272
logger.error(f"Failed to set up local workspace: {e}")
73+
# Clean up any partial workspace on error
74+
if "workspace_dir" in locals():
75+
try:
76+
import shutil
77+
78+
shutil.rmtree(workspace_dir, ignore_errors=True)
79+
logger.info(
80+
f"Cleaned up partial workspace after error: {workspace_dir}"
81+
)
82+
except Exception:
83+
pass
84+
7385
# Fallback to current working directory
7486
workspace_dir = tempfile.mkdtemp(
7587
prefix="qualityflow_fallback_workspace_"

qualityflow/steps/gen_tests_agent.py

Lines changed: 104 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,35 @@ def _get_default_prompt_template() -> str:
321321
def _generate_fake_tests(
322322
file_path: str, source_code: str, max_tests: int
323323
) -> Tuple[str, Dict]:
324-
"""Generate fake/mock tests for development/testing."""
324+
"""Generate fake/mock tests for development/testing.
325+
326+
This generates more realistic-looking tests that attempt to exercise
327+
the actual source code by parsing it for functions and classes.
328+
"""
329+
import ast
330+
331+
# Parse the source code to extract function/class names
332+
try:
333+
tree = ast.parse(source_code)
334+
functions = []
335+
classes = []
336+
337+
for node in ast.walk(tree):
338+
if isinstance(node, ast.FunctionDef) and not node.name.startswith(
339+
"_"
340+
):
341+
functions.append(node.name)
342+
elif isinstance(node, ast.ClassDef):
343+
classes.append(node.name)
344+
except Exception:
345+
# Fallback if parsing fails
346+
functions = []
347+
classes = []
348+
349+
# Generate module name from file path
350+
module_name = file_path.replace("/", ".").replace(".py", "")
351+
class_name = file_path.split("/")[-1].replace(".py", "").title()
352+
325353
test_content = f'''"""
326354
Generated tests for {file_path}
327355
"""
@@ -330,43 +358,78 @@ def _generate_fake_tests(
330358
import unittest
331359
from unittest.mock import Mock, patch, MagicMock
332360
333-
class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase):
361+
# Attempt to import the module under test
362+
try:
363+
from {module_name} import *
364+
except ImportError:
365+
# Handle import errors gracefully for demo purposes
366+
pass
367+
368+
class Test{class_name}(unittest.TestCase):
334369
"""Auto-generated test class for {file_path}."""
335370
371+
def setUp(self):
372+
"""Set up test fixtures."""
373+
self.test_data = {{"sample": "data", "numbers": [1, 2, 3]}}
374+
336375
def test_module_import(self):
337-
"""Test that we can at least validate the test framework."""
338-
# Simple test that always passes to ensure test discovery works
339-
self.assertTrue(True)
340-
341-
def test_basic_functionality(self):
342-
"""Test basic functionality."""
343-
# Mock test demonstrating test execution
344-
result = 1 + 1
345-
self.assertEqual(result, 2)
346-
376+
"""Test that the module can be imported without errors."""
377+
# This test ensures the module structure is valid
378+
self.assertTrue(True, "Module imported successfully")
379+
'''
380+
381+
# Generate tests for discovered functions
382+
for func_name in functions[: max_tests // 2]:
383+
test_content += f'''
384+
def test_{func_name}_basic(self):
385+
"""Test basic functionality of {func_name}."""
386+
# TODO: Add proper test for {func_name}
387+
# This is a placeholder that should exercise the function
388+
try:
389+
# Attempt to call the function with basic parameters
390+
if callable(globals().get('{func_name}')):
391+
# Basic smoke test - at least try to call it
392+
pass
393+
except NameError:
394+
# Function not available in scope
395+
pass
396+
self.assertTrue(True, "Basic test for {func_name}")
397+
'''
398+
399+
# Generate tests for discovered classes
400+
for class_name_found in classes[: max_tests // 3]:
401+
test_content += f'''
402+
def test_{class_name_found.lower()}_instantiation(self):
403+
"""Test that {class_name_found} can be instantiated."""
404+
try:
405+
if '{class_name_found}' in globals():
406+
# Try basic instantiation
407+
# obj = {class_name_found}()
408+
pass
409+
except NameError:
410+
pass
411+
self.assertTrue(True, "Instantiation test for {class_name_found}")
412+
'''
413+
414+
# Add some general coverage tests
415+
test_content += f'''
347416
def test_error_handling(self):
348-
"""Test error handling."""
349-
# Test exception handling
417+
"""Test error handling patterns."""
350418
with self.assertRaises(ValueError):
351419
raise ValueError("Expected test exception")
352420
421+
def test_data_structures(self):
422+
"""Test basic data structure operations."""
423+
data = self.test_data.copy()
424+
self.assertIn("sample", data)
425+
self.assertEqual(len(data["numbers"]), 3)
426+
353427
def test_mock_usage(self):
354428
"""Test mock functionality."""
355-
# Test using mocks
356429
mock_obj = Mock()
357430
mock_obj.method.return_value = "mocked_result"
358431
result = mock_obj.method()
359432
self.assertEqual(result, "mocked_result")
360-
361-
def test_coverage_target(self):
362-
"""Test that generates some coverage."""
363-
# Simple operations to generate coverage
364-
data = {{"key": "value"}}
365-
self.assertIn("key", data)
366-
367-
items = [1, 2, 3, 4, 5]
368-
filtered = [x for x in items if x > 3]
369-
self.assertEqual(len(filtered), 2)
370433
371434
if __name__ == "__main__":
372435
unittest.main()
@@ -508,14 +571,27 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]:
508571
def _estimate_cost(
509572
tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str
510573
) -> float:
511-
"""Estimate cost based on token usage."""
512-
# Rough cost estimates (would need real pricing)
574+
"""Estimate cost based on token usage.
575+
576+
WARNING: These are hardcoded pricing estimates that will become outdated.
577+
For accurate pricing, refer to the official pricing pages:
578+
- OpenAI: https://openai.com/api/pricing/
579+
- Anthropic: https://www.anthropic.com/pricing
580+
581+
Consider implementing a dynamic pricing lookup or configuration-based approach
582+
for production use.
583+
"""
584+
# NOTE: These are rough estimates based on pricing as of early 2024
585+
# and will likely become outdated as providers update their pricing
513586
if provider == GenerationProvider.OPENAI:
514587
if "gpt-4" in model:
588+
# GPT-4 pricing (approximate, check current rates)
515589
return (tokens_in * 0.00003) + (tokens_out * 0.00006)
516-
else: # gpt-3.5
590+
else: # gpt-3.5 and other models
591+
# GPT-3.5 pricing (approximate, check current rates)
517592
return (tokens_in * 0.0000015) + (tokens_out * 0.000002)
518593
elif provider == GenerationProvider.ANTHROPIC:
594+
# Claude pricing (approximate, check current rates)
519595
return (tokens_in * 0.000008) + (tokens_out * 0.000024)
520596
else:
521597
return 0.0

0 commit comments

Comments
 (0)