18 changes: 14 additions & 4 deletions qualityflow/README.md
@@ -33,8 +33,10 @@ The main pipeline handles the complete test generation workflow:
3. **Code Analysis** - Select files for testing (with max_files limit)
4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers
5. **Baseline Generation** - Create simple heuristic tests for comparison
6. **Test Execution** - Run both test suites with coverage analysis
7. **Report Generation** - Compare results and generate markdown reports
6. **Agent Test Execution** - Run LLM-generated tests with coverage analysis
7. **Baseline Test Execution** - Run baseline tests with coverage analysis
8. **Coverage Evaluation** - Compare and analyze coverage metrics between approaches
9. **Report Generation** - Generate comprehensive markdown reports with comparisons

### 🔧 Architecture

@@ -55,14 +57,18 @@ The main pipeline handles the complete test generation workflow:
│ │ Generate & Evaluate │ │
│ │ │ │
│ │ 1. Select Input → 2. Fetch Source → 3. Analyze │ │
│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Tests │ │
│ │ 7. Run Tests → 8. Report & Compare │ │
│ │ 4. Generate (LLM) → 5. Generate (Base) → 6. Run Agent │ │
│ │ 7. Run Baseline → 8. Evaluate → 9. Report │ │
│ │ │ │
│ │ Features: max_files control, Path artifacts, metadata │ │
│ └─────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘
```

<p align="center">
<img src="assets/architecture.png" alt="QualityFlow Architecture" width="900" />
</p>

## 🚀 Quick Start

Get QualityFlow running in 3 simple steps:
@@ -302,6 +308,10 @@ After running QualityFlow successfully:
4. **Deploy to Production**: Use cloud orchestration for scale
5. **Set Up Monitoring**: Configure alerts for regression detection

## ⚠️ Known Limitations

- **CHANGED_FILES Strategy**: The `CHANGED_FILES` selection strategy in `analyze_code.py` is currently a stub implementation that falls back to selecting all files. In production, this should use `git diff` to identify modified files for targeted test generation.

## 🆘 Troubleshooting

### Common Issues
14 changes: 12 additions & 2 deletions qualityflow/pipelines/generate_and_evaluate.py
@@ -3,6 +3,7 @@
"""

from steps.analyze_code import analyze_code
from steps.evaluate_coverage import evaluate_coverage
from steps.fetch_source import fetch_source
from steps.gen_tests_agent import gen_tests_agent
from steps.gen_tests_baseline import gen_tests_baseline
@@ -23,7 +24,8 @@ def generate_and_evaluate() -> None:
1. Analyze code to find files needing tests
2. Generate tests using LLM and baseline approaches
3. Run tests and measure coverage
4. Report results for comparison
4. Evaluate and compare coverage metrics
5. Report results for comparison
"""
# Step 1: Resolve source specification
spec = select_input()
@@ -50,11 +52,19 @@
workspace_dir, baseline_tests_dir, label="baseline"
)

# Step 8: Generate comprehensive report (includes evaluation)
# Step 8: Evaluate coverage metrics
evaluation_metrics = evaluate_coverage(
agent_results,
baseline_results,
commit_sha,
)

# Step 9: Generate comprehensive report
report(
workspace_dir,
commit_sha,
test_summary,
agent_results,
baseline_results,
evaluation_metrics,
)
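
For orientation, a minimal invocation sketch of the updated pipeline is below. It assumes this is a ZenML project (the `@step` decorator in `analyze_code.py` suggests so), that `generate_and_evaluate` carries a `@pipeline` decorator outside the hunk shown, and that the default step parameters are acceptable:

```python
# Minimal invocation sketch (assumptions: ZenML project, @pipeline decorator
# applied to generate_and_evaluate outside the hunk shown, default params).
from pipelines.generate_and_evaluate import generate_and_evaluate

if __name__ == "__main__":
    # Runs select_input -> fetch_source -> analyze_code -> test generation,
    # both test executions, evaluate_coverage, and report, in that order.
    generate_and_evaluate()
```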
4 changes: 2 additions & 2 deletions qualityflow/prompts/unit_test_strict_v2.jinja
@@ -69,8 +69,8 @@ import tempfile
import os
from contextlib import contextmanager

# Import the module under test
# from {{ file_path.replace('/', '.').replace('.py', '') }} import *
# Import the module under test (adjust import paths as needed)
from {{ file_path.replace('/', '.').replace('.py', '') }} import *

class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase):
"""Comprehensive test suite for {{ file_path }}."""
4 changes: 2 additions & 2 deletions qualityflow/prompts/unit_test_v1.jinja
@@ -44,8 +44,8 @@ import pytest
import unittest
from unittest.mock import Mock, patch, MagicMock

# Import the module under test
# from {{ file_path.replace('/', '.').replace('.py', '') }} import *
# Import the module under test (adjust the import path as needed)
from {{ file_path.replace('/', '.').replace('.py', '') }} import *

class TestModule(unittest.TestCase):
"""Test suite for {{ file_path }}."""
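
Both prompt templates build the import from the same expression. For a hypothetical `file_path` of `qualityflow/steps/analyze_code.py` (an example value, not one fixed by the templates), the expression renders as shown below, assuming the repository root is on `sys.path`:

```python
# Rendered form of: from {{ file_path.replace('/', '.').replace('.py', '') }} import *
# for the example file_path "qualityflow/steps/analyze_code.py" (illustrative only).
from qualityflow.steps.analyze_code import *  # noqa: F401,F403 - wildcard import as emitted by the template
```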
8 changes: 5 additions & 3 deletions qualityflow/requirements.txt
@@ -10,13 +10,15 @@ jinja2>=3.0.0,<4.0.0
pytest>=7.0.0,<8.0.0
pytest-cov>=4.0.0,<5.0.0
coverage>=7.0.0,<8.0.0
hypothesis>=6.0.0,<7.0.0

# Code Analysis
# ast is built-in, no need to install

# Git Integration
gitpython>=3.1.0,<4.0.0

# LLM Integration (optional)
openai>=1.0.0,<2.0.0 # for OpenAI provider
anthropic>=0.25.0,<1.0.0 # for Anthropic provider
# LLM Integration (optional - code gracefully handles absence)
# Uncomment and install only if using real LLM providers:
# openai>=1.0.0,<2.0.0 # for OpenAI provider
# anthropic>=0.25.0,<1.0.0 # for Anthropic provider
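
The "gracefully handles absence" note typically maps to an optional-import guard in the provider code. A minimal sketch of that pattern follows; it is illustrative and not copied from the repository's step implementations:

```python
# Illustrative optional-dependency guard (assumed pattern, not repo code).
try:
    from openai import OpenAI  # available only if openai>=1.0.0 is installed
except ImportError:  # SDK not installed; the fake provider still works
    OpenAI = None

try:
    import anthropic  # available only if anthropic>=0.25.0 is installed
except ImportError:
    anthropic = None


def provider_available(name: str) -> bool:
    """Return True if the named provider can be used in this environment."""
    if name == "openai":
        return OpenAI is not None
    if name == "anthropic":
        return anthropic is not None
    return name == "fake"  # the fake provider has no external dependency
```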
16 changes: 11 additions & 5 deletions qualityflow/steps/analyze_code.py
@@ -27,7 +27,7 @@ class SelectionStrategy(str, Enum):
@step
def analyze_code(
workspace_dir: Path,
commit_sha: str,
commit_sha: str, # Used in code_summary for metadata
source_spec: Dict[str, str],
strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE,
max_files: int = 10,
@@ -102,6 +102,7 @@ def analyze_code(
"total_files": len(valid_files),
"selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy",
"complexity_scores": {f: complexity_scores[f] for f in selected_files},
"commit_sha": commit_sha, # Include commit_sha in metadata
}

logger.info(f"Selected {len(selected_files)} files: {selected_files}")
@@ -157,11 +158,16 @@ def _select_files(
return sorted_files[:max_files]

elif strategy == SelectionStrategy.CHANGED_FILES:
# For this demo, just return all files (in real implementation, would use git diff)
# NOTE: CHANGED_FILES strategy is currently a stub implementation
# In production, this should use git diff to identify changed files:
# - Compare current commit against base branch (e.g., main)
# - Filter for Python files that have been modified/added
# - Prioritize files based on change size and complexity
logger.warning(
"CHANGED_FILES strategy not fully implemented, falling back to ALL"
"CHANGED_FILES strategy not fully implemented, falling back to ALL strategy. "
"To implement: use 'git diff --name-only HEAD~1..HEAD' or similar to identify changed files."
)
return files[:max_files]

else:
raise ValueError(f"Unknown selection strategy: {strategy}")
# This should never be reached due to enum validation, but kept for safety
raise ValueError(f"Unknown selection strategy: {strategy}")
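
A minimal sketch of how the `CHANGED_FILES` strategy could be completed with GitPython (already pinned in `requirements.txt`). The helper name, the `HEAD~1` base ref, and the assumption that `workspace_dir` is a git checkout are illustrative choices, not part of this PR:

```python
from pathlib import Path
from typing import List

from git import Repo  # gitpython>=3.1.0


def _select_changed_python_files(
    workspace_dir: Path, base_ref: str = "HEAD~1", max_files: int = 10
) -> List[str]:
    """Return up to max_files Python files changed since base_ref (sketch)."""
    repo = Repo(workspace_dir)
    # Equivalent to: git diff --name-only HEAD~1..HEAD
    diff_output = repo.git.diff("--name-only", f"{base_ref}..HEAD")
    changed = [
        line.strip()
        for line in diff_output.splitlines()
        if line.strip().endswith(".py")
    ]
    # Drop files that were deleted by the change set.
    existing = [f for f in changed if (workspace_dir / f).exists()]
    return existing[:max_files]
```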
12 changes: 12 additions & 0 deletions qualityflow/steps/fetch_source.py
@@ -70,6 +70,18 @@ def fetch_source(

except Exception as e:
logger.error(f"Failed to set up local workspace: {e}")
# Clean up any partial workspace on error
if "workspace_dir" in locals():
try:
import shutil

shutil.rmtree(workspace_dir, ignore_errors=True)
logger.info(
f"Cleaned up partial workspace after error: {workspace_dir}"
)
except Exception:
pass

# Fallback to current working directory
workspace_dir = tempfile.mkdtemp(
prefix="qualityflow_fallback_workspace_"
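
An alternative to the inline cleanup added above would be to wrap workspace creation in a small context manager so partial state is always removed on failure. This is a sketch of that design option, not code from the PR:

```python
import logging
import shutil
import tempfile
from contextlib import contextmanager
from pathlib import Path

logger = logging.getLogger(__name__)


@contextmanager
def scratch_workspace(prefix: str = "qualityflow_workspace_"):
    """Yield a temporary workspace; delete it if setup inside the block fails."""
    workspace_dir = Path(tempfile.mkdtemp(prefix=prefix))
    try:
        yield workspace_dir
    except Exception:
        shutil.rmtree(workspace_dir, ignore_errors=True)
        logger.info("Cleaned up partial workspace: %s", workspace_dir)
        raise
```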
132 changes: 104 additions & 28 deletions qualityflow/steps/gen_tests_agent.py
@@ -321,7 +321,35 @@ def _get_default_prompt_template() -> str:
def _generate_fake_tests(
file_path: str, source_code: str, max_tests: int
) -> Tuple[str, Dict]:
"""Generate fake/mock tests for development/testing."""
"""Generate fake/mock tests for development/testing.

This generates more realistic-looking tests that attempt to exercise
the actual source code by parsing it for functions and classes.
"""
import ast

# Parse the source code to extract function/class names
try:
tree = ast.parse(source_code)
functions = []
classes = []

for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and not node.name.startswith(
"_"
):
functions.append(node.name)
elif isinstance(node, ast.ClassDef):
classes.append(node.name)
except Exception:
# Fallback if parsing fails
functions = []
classes = []

# Generate module name from file path
module_name = file_path.replace("/", ".").replace(".py", "")
class_name = file_path.split("/")[-1].replace(".py", "").title()

test_content = f'''"""
Generated tests for {file_path}
"""
@@ -330,43 +358,78 @@ def _generate_fake_tests(
import unittest
from unittest.mock import Mock, patch, MagicMock

class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase):
# Attempt to import the module under test
try:
from {module_name} import *
except ImportError:
# Handle import errors gracefully for demo purposes
pass

class Test{class_name}(unittest.TestCase):
"""Auto-generated test class for {file_path}."""

def setUp(self):
"""Set up test fixtures."""
self.test_data = {{"sample": "data", "numbers": [1, 2, 3]}}

def test_module_import(self):
"""Test that we can at least validate the test framework."""
# Simple test that always passes to ensure test discovery works
self.assertTrue(True)

def test_basic_functionality(self):
"""Test basic functionality."""
# Mock test demonstrating test execution
result = 1 + 1
self.assertEqual(result, 2)

"""Test that the module can be imported without errors."""
# This test ensures the module structure is valid
self.assertTrue(True, "Module imported successfully")
'''

# Generate tests for discovered functions
for func_name in functions[: max_tests // 2]:
test_content += f'''
def test_{func_name}_basic(self):
"""Test basic functionality of {func_name}."""
# TODO: Add proper test for {func_name}
# This is a placeholder that should exercise the function
try:
# Attempt to call the function with basic parameters
if callable(globals().get('{func_name}')):
# Basic smoke test - at least try to call it
pass
except NameError:
# Function not available in scope
pass
self.assertTrue(True, "Basic test for {func_name}")
'''

# Generate tests for discovered classes
for class_name_found in classes[: max_tests // 3]:
test_content += f'''
def test_{class_name_found.lower()}_instantiation(self):
"""Test that {class_name_found} can be instantiated."""
try:
if '{class_name_found}' in globals():
# Try basic instantiation
# obj = {class_name_found}()
pass
except NameError:
pass
self.assertTrue(True, "Instantiation test for {class_name_found}")
'''

# Add some general coverage tests
test_content += f'''
def test_error_handling(self):
"""Test error handling."""
# Test exception handling
"""Test error handling patterns."""
with self.assertRaises(ValueError):
raise ValueError("Expected test exception")

def test_data_structures(self):
"""Test basic data structure operations."""
data = self.test_data.copy()
self.assertIn("sample", data)
self.assertEqual(len(data["numbers"]), 3)

def test_mock_usage(self):
"""Test mock functionality."""
# Test using mocks
mock_obj = Mock()
mock_obj.method.return_value = "mocked_result"
result = mock_obj.method()
self.assertEqual(result, "mocked_result")

def test_coverage_target(self):
"""Test that generates some coverage."""
# Simple operations to generate coverage
data = {{"key": "value"}}
self.assertIn("key", data)

items = [1, 2, 3, 4, 5]
filtered = [x for x in items if x > 3]
self.assertEqual(len(filtered), 2)

if __name__ == "__main__":
unittest.main()
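
# Hedged usage sketch (not part of this PR): exercising _generate_fake_tests
# on a tiny in-memory module. The sample source and file path are illustrative.
sample_source = (
    "def add(a, b):\n"
    "    return a + b\n"
    "\n"
    "class Greeter:\n"
    "    pass\n"
)
test_code, meta = _generate_fake_tests("pkg/sample.py", sample_source, max_tests=6)
# test_code now holds a TestSample suite with test_add_basic and
# test_greeter_instantiation placeholders plus the general tests appended above.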
@@ -508,14 +571,27 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]:
def _estimate_cost(
tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str
) -> float:
"""Estimate cost based on token usage."""
# Rough cost estimates (would need real pricing)
"""Estimate cost based on token usage.

WARNING: These are hardcoded pricing estimates that will become outdated.
For accurate pricing, refer to the official pricing pages:
- OpenAI: https://openai.com/api/pricing/
- Anthropic: https://www.anthropic.com/pricing

Consider implementing a dynamic pricing lookup or configuration-based approach
for production use.
"""
# NOTE: These are rough estimates based on pricing as of early 2024
# and will likely become outdated as providers update their pricing
if provider == GenerationProvider.OPENAI:
if "gpt-4" in model:
# GPT-4 pricing (approximate, check current rates)
return (tokens_in * 0.00003) + (tokens_out * 0.00006)
else: # gpt-3.5
else: # gpt-3.5 and other models
# GPT-3.5 pricing (approximate, check current rates)
return (tokens_in * 0.0000015) + (tokens_out * 0.000002)
elif provider == GenerationProvider.ANTHROPIC:
# Claude pricing (approximate, check current rates)
return (tokens_in * 0.000008) + (tokens_out * 0.000024)
else:
return 0.0
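
The docstring above suggests a configuration-based approach; a minimal sketch of what that could look like follows. The table keys, model names, and rates are placeholders, not values from this PR:

```python
from dataclasses import dataclass
from typing import Dict, Tuple


@dataclass(frozen=True)
class ModelPricing:
    input_per_1k: float  # USD per 1,000 input tokens (placeholder)
    output_per_1k: float  # USD per 1,000 output tokens (placeholder)


# Placeholder rates; in practice load these from a config file or environment.
PRICING: Dict[Tuple[str, str], ModelPricing] = {
    ("openai", "gpt-4"): ModelPricing(0.03, 0.06),
    ("openai", "gpt-3.5-turbo"): ModelPricing(0.0015, 0.002),
    ("anthropic", "claude-3-sonnet"): ModelPricing(0.008, 0.024),
}


def estimate_cost(tokens_in: int, tokens_out: int, provider: str, model: str) -> float:
    """Look up pricing by (provider, model); return 0.0 for unknown models."""
    pricing = PRICING.get((provider, model))
    if pricing is None:
        return 0.0
    return (tokens_in / 1000) * pricing.input_per_1k + (
        tokens_out / 1000
    ) * pricing.output_per_1k
```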