diff --git a/ADDING_PROJECTS.md b/ADDING_PROJECTS.md index a169560d..35f26d3b 100644 --- a/ADDING_PROJECTS.md +++ b/ADDING_PROJECTS.md @@ -56,8 +56,7 @@ ENV ZENML_ENABLE_TUTORIAL=true ### When No Dockerfile is Needed -If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at: -[https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile](https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile) +If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at the zenml-projects-backend repo. ## ๐Ÿ”ง Backend Integration diff --git a/README.md b/README.md index 27b762ba..46accab2 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ etc. | [Nightwatch AI](nightwatch-ai) | ๐Ÿค– LLMOps | ๐Ÿ“ Summarization, ๐Ÿ“Š Reporting | openai, supabase, slack | | [ResearchRadar](research-radar) | ๐Ÿค– LLMOps | ๐Ÿ“ Classification, ๐Ÿ“Š Comparison | anthropic, huggingface, transformers | | [Deep Research](deep_research) | ๐Ÿค– LLMOps | ๐Ÿ“ Research, ๐Ÿ“Š Reporting, ๐Ÿ” Web Search | anthropic, mcp, agents, openai | +| [QualityFlow](qualityflow) | ๐Ÿค– LLMOps | ๐Ÿงช Test Generation, ๐Ÿ“Š Coverage Analysis, โšก Automation | openai, anthropic, pytest, jinja2 | | [End-to-end Computer Vision](end-to-end-computer-vision) | ๐Ÿ‘ CV | ๐Ÿ”Ž Object Detection, ๐Ÿท๏ธ Labeling | pytorch, label_studio, yolov8 | | [Magic Photobooth](magic-photobooth) | ๐Ÿ‘ CV | ๐Ÿ“ท Image Gen, ๐ŸŽž๏ธ Video Gen | stable-diffusion, huggingface | | [OmniReader](omni-reader) | ๐Ÿ‘ CV | ๐Ÿ“‘ OCR, ๐Ÿ“Š Evaluation, โš™๏ธ Batch Processing | polars, litellm, openai, ollama | diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index f352d2bf..7fd23bad 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -235,7 +235,7 @@ python run.py synthetic You will also need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) @@ -254,7 +254,7 @@ zenml secret update llm-complete -v '{"argilla_api_key": "YOUR_ARGILLA_API_KEY", As with the previous pipeline, you will need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) diff --git a/qualityflow/README.md b/qualityflow/README.md new file mode 100644 index 00000000..dd69e58e --- /dev/null +++ b/qualityflow/README.md @@ -0,0 +1,332 @@ +# ๐Ÿงช QualityFlow: AI-Powered Test Generation Pipeline + +A streamlined MLOps pipeline for **automated test generation** using ZenML and LLMs. 
Generate comprehensive unit tests for your codebase, compare different approaches, and get detailed coverage analysis. + +## ๐Ÿš€ Product Overview + +QualityFlow demonstrates how to build production-ready MLOps workflows for automated test generation using Large Language Models. Built with ZenML, it provides a simple yet powerful pipeline for generating and evaluating AI-generated tests. + +**Focus**: **LLM-Powered Test Generation** and **Coverage Analysis**. + +### Key Features + +- **Real LLM Integration**: OpenAI and Anthropic providers for intelligent test generation +- **Smart File Selection**: Configurable strategies to focus on files that need testing +- **Baseline Comparison**: Compare LLM-generated tests vs heuristic baseline tests +- **Coverage Analysis**: Real coverage metrics with detailed reporting +- **Speed Controls**: `max_files` parameters to control pipeline execution time +- **Containerized Ready**: Uses ZenML Path artifacts for remote execution +- **Cost Tracking**: Token usage and cost estimation with metadata logging + +## ๐Ÿ’ก How It Works + +### โœˆ๏ธ Pipeline Architecture + +QualityFlow consists of a single, focused pipeline: + +#### Generate & Evaluate Pipeline + +The main pipeline handles the complete test generation workflow: + +1. **Source Selection** - Specify repository and target files +2. **Code Fetching** - Clone and materialize workspace +3. **Code Analysis** - Select files for testing (with max_files limit) +4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers +5. **Baseline Generation** - Create simple heuristic tests for comparison +6. **Test Execution** - Run both test suites with coverage analysis +7. **Report Generation** - Compare results and generate markdown reports + +### ๐Ÿ”ง Architecture + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Git Repo โ”‚ โ”‚ LLM Providers โ”‚ โ”‚ Test Reports โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ src/**/*.py โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ OpenAI/Claude โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ Coverage โ”‚ +โ”‚ target files โ”‚ โ”‚ Fake (testing) โ”‚ โ”‚ Comparisons โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ Cost Tracking โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ โ–ฒ + โ–ผ โ”‚ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ QualityFlow Pipeline โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ Generate & Evaluate โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ 1. Select Input โ†’ 2. Fetch Source โ†’ 3. Analyze โ”‚ โ”‚ +โ”‚ โ”‚ 4. Generate (LLM) โ†’ 5. Generate (Base) โ†’ 6. Run Tests โ”‚ โ”‚ +โ”‚ โ”‚ 7. Run Tests โ†’ 8. 
Report & Compare โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Features: max_files control, Path artifacts, metadata โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +## ๐Ÿš€ Quick Start + +Get QualityFlow running in 3 simple steps: + +### 1. Install Dependencies +```bash +pip install -r requirements.txt +``` + +### 2. Optional: Set up OpenAI API Key +```bash +export OPENAI_API_KEY="your-api-key-here" +``` +*Skip this step to use the fake provider for testing* + +### 3. Run the Pipeline +```bash +python run.py +``` + +**That's it!** The pipeline will automatically: +- Clone a sample repository (requests library by default) +- Analyze Python files and select test candidates +- Generate tests using LLM or fake provider +- Run tests and measure coverage +- Create a detailed comparison report + +### What Happens Next? + +- Check the ZenML dashboard to see pipeline results +- View generated test files and coverage reports +- Compare LLM vs baseline test approaches +- Experiment with different configurations + +### Local Testing Option + +For offline development or controlled testing, use the local examples: + +```bash +python run.py --config configs/experiment.local.yaml +``` + +This uses the included `examples/toy_lib/` code instead of cloning external repositories. + +## โš™๏ธ Configuration + +### Key Parameters + +You can customize the pipeline behavior by editing `configs/experiment.default.yaml`: + +```yaml +# Control execution speed +steps: + analyze_code: + parameters: + max_files: 3 # Limit files to analyze (faster execution) + + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + max_files: 2 # Limit files for test generation + max_tests_per_file: 3 + + gen_tests_baseline: + parameters: + max_files: 2 # Match agent for fair comparison +``` + +### Pipeline Options + +```bash +# Default: uses remote repository (requests library) +python run.py + +# Local testing with included examples +python run.py --config configs/experiment.local.yaml + +# High-quality test generation +python run.py --config configs/experiment.strict.yaml + +# Force fresh execution (no caching) +python run.py --no-cache +``` + +## ๐Ÿ”ฌ Advanced Usage + +### Different Target Repositories + +Edit the config to point to your own repository: + +```yaml +steps: + select_input: + parameters: + repo_url: "https://github.com/your-org/your-repo.git" + ref: "main" + target_glob: "src/**/*.py" # Adjust path pattern +``` + +### Custom Prompts + +Create new Jinja2 templates in `prompts/`: + +```jinja2 +# prompts/custom_test_v3.jinja + +Generate {{ max_tests }} tests for: +{{ file_path }} (complexity: {{ complexity_score }}) + +Source: +```python +{{ source_code }} +``` + +Requirements: +- Use pytest fixtures +- Include edge cases +- Mock external dependencies +``` + +### A/B Testing Experiments + +Compare different configurations by running with different config files: + +```bash +# Compare prompt versions +python run.py --config configs/experiment.default.yaml +python run.py --config configs/experiment.strict.yaml + +# Compare results in ZenML dashboard: +# - Coverage metrics +# - Test quality scores +# - Token usage and cost +``` + +### 
Production Deployment + +Set up ZenML stack for cloud deployment: + +```bash +# Example: AWS EKS stack +zenml artifact-store register s3_store --flavor=s3 --path=s3://your-bucket +zenml container-registry register ecr_registry --flavor=aws --uri=your-account.dkr.ecr.region.amazonaws.com +zenml orchestrator register k8s_orchestrator --flavor=kubernetes --kubernetes_context=your-eks-context + +zenml stack register production_stack \ + -a s3_store -c ecr_registry -o k8s_orchestrator --set +``` + +### Scheduled Execution + +For automated runs, set up scheduled execution using your preferred orchestration tool or ZenML's scheduling features. + +## ๐Ÿ—๏ธ Project Structure + +``` +qualityflow/ +โ”œโ”€โ”€ README.md +โ”œโ”€โ”€ requirements.txt +โ”‚ +โ”œโ”€โ”€ configs/ # Pipeline configurations +โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings +โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ””โ”€โ”€ experiment.local.yaml # Local examples testing +โ”‚ +โ”œโ”€โ”€ pipelines/ # Pipeline definitions +โ”‚ โ””โ”€โ”€ generate_and_evaluate.py # Main pipeline +โ”‚ +โ”œโ”€โ”€ steps/ # Pipeline steps +โ”‚ โ”œโ”€โ”€ select_input.py # Source specification +โ”‚ โ”œโ”€โ”€ fetch_source.py # Repository fetching +โ”‚ โ”œโ”€โ”€ analyze_code.py # Code analysis & selection +โ”‚ โ”œโ”€โ”€ gen_tests_agent.py # LLM test generation +โ”‚ โ”œโ”€โ”€ gen_tests_baseline.py # Heuristic test generation +โ”‚ โ”œโ”€โ”€ run_tests.py # Test execution & coverage +โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics evaluation +โ”‚ โ””โ”€โ”€ report.py # Report generation +โ”‚ +โ”œโ”€โ”€ prompts/ # Jinja2 prompt templates +โ”‚ โ”œโ”€โ”€ unit_test_v1.jinja # Standard test generation +โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation +โ”‚ +โ”œโ”€โ”€ examples/ # Demo code for testing +โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library with test-friendly code +โ”‚ โ”œโ”€โ”€ calculator.py # Calculator class with edge cases +โ”‚ โ””โ”€โ”€ string_utils.py # String utilities with validation +โ”‚ +โ””โ”€โ”€ run.py # Main entry point +``` + +### Key Components + +- **Pipeline Steps**: Modular, reusable components with clear interfaces +- **Prompt Templates**: Jinja2 templates for LLM test generation +- **Configuration**: YAML-driven experiment settings +- **Test Generation**: Both LLM-based and heuristic approaches for comparison +- **Example Code**: Sample Python modules (`toy_lib`) designed for effective test generation demonstration + +## ๐Ÿš€ Production Deployment + +### ZenML Cloud Stack Setup + +For production deployment with ZenML Cloud: + +```bash +# Connect to ZenML Cloud +zenml connect --url https://your-org.zenml.cloud + +# Register cloud stack components +zenml artifact-store register cloud_store --flavor=s3 --path=s3://qualityflow-artifacts +zenml orchestrator register cloud_k8s --flavor=kubernetes --kubernetes_context=prod-cluster + +zenml stack register production \ + -a cloud_store -o cloud_k8s --set +``` + +### Scheduled Execution + +Set up automated regression testing using ZenML's scheduling capabilities or your preferred orchestration platform. + +## ๐Ÿค Contributing + +QualityFlow follows ZenML best practices and is designed to be extended: + +1. **Add New LLM Providers**: Extend `gen_tests_agent.py` with new provider integrations +3. **Additional Metrics**: Expand evaluation capabilities with new quality metrics +4. **Selection Strategies**: Add new code selection algorithms + +## ๐Ÿ“ Next Steps + +After running QualityFlow successfully: + +1. 
**Explore ZenML Dashboard**: View pipeline runs, artifacts, and model registry +2. **Experiment with Prompts**: Try different test generation strategies +3. **Add Real Codebases**: Replace toy examples with your production code +4. **Deploy to Production**: Use cloud orchestration for scale +5. **Set Up Monitoring**: Configure alerts for regression detection + +## ๐Ÿ†˜ Troubleshooting + +### Common Issues + +**LLM API Errors**: +- Set `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` environment variables +- Use `provider: "fake"` for development without API keys + +**Test Execution Failures**: +- Ensure pytest and coverage tools are installed +- Check that workspace has proper Python path setup + +### Debug Mode + +Run with debug logging: + +```bash +export ZENML_LOGGING_VERBOSITY=DEBUG +python run.py --config configs/experiment.default.yaml +``` + +## ๐Ÿ“š Resources + +- [ZenML Documentation](https://docs.zenml.io/) + +--- + +Built with โค๏ธ using [ZenML](https://zenml.io) - *The MLOps Framework for Production AI* \ No newline at end of file diff --git a/qualityflow/configs/experiment.default.yaml b/qualityflow/configs/experiment.default.yaml new file mode 100644 index 00000000..61537368 --- /dev/null +++ b/qualityflow/configs/experiment.default.yaml @@ -0,0 +1,41 @@ +# QualityFlow Default Experiment Configuration +# Production-ready template for automated test generation & validation + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" # low_coverage | changed_files | all + max_files: 3 # Reduced for faster testing + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 2 # Limit files for faster testing + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 2 # Match agent max_files for consistency + + # No more evaluation gates or promotion - just simple coverage comparison + +# Resource configuration +settings: + docker: + requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/configs/experiment.local.yaml b/qualityflow/configs/experiment.local.yaml new file mode 100644 index 00000000..477f089e --- /dev/null +++ b/qualityflow/configs/experiment.local.yaml @@ -0,0 +1,40 @@ +# QualityFlow Local Examples Configuration +# Use local toy_lib examples instead of remote repositories + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration - using local examples +steps: + select_input: + parameters: + # Use local examples instead of remote repo + repo_url: "local" + ref: "main" + target_glob: "examples/**/*.py" # Target the toy_lib examples + + analyze_code: + parameters: + strategy: "all" # Include all example files + max_files: 5 # Process all toy_lib files + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "fake" # Use fake provider by default for local testing + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 5 # Process all toy_lib files + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent max_files + +# Resource configuration +settings: + docker: + 
requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/configs/experiment.strict.yaml b/qualityflow/configs/experiment.strict.yaml new file mode 100644 index 00000000..8d1d15ba --- /dev/null +++ b/qualityflow/configs/experiment.strict.yaml @@ -0,0 +1,42 @@ +# QualityFlow Strict Experiment Configuration +# Higher quality gates and strict prompt for comprehensive testing + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py,tests/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" + max_files: 5 # Fewer files for more thorough testing + + # LLM generation with strict prompt + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o" # More powerful model + prompt_path: "prompts/unit_test_strict_v2.jinja" + max_tests_per_file: 5 # More tests per file + max_files: 5 # Match analyze_code for consistency + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent for fair comparison + +# Resource configuration with higher limits +settings: + docker: + requirements: requirements.txt + resource_settings: + memory: "4Gi" + cpu_count: 2.0 \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/__init__.py b/qualityflow/examples/toy_lib/__init__.py new file mode 100644 index 00000000..8b91a8dd --- /dev/null +++ b/qualityflow/examples/toy_lib/__init__.py @@ -0,0 +1,5 @@ +""" +QualityFlow toy library example for testing. +""" + +__version__ = "0.1.0" diff --git a/qualityflow/examples/toy_lib/calculator.py b/qualityflow/examples/toy_lib/calculator.py new file mode 100644 index 00000000..38bc9964 --- /dev/null +++ b/qualityflow/examples/toy_lib/calculator.py @@ -0,0 +1,85 @@ +"""Simple calculator module for QualityFlow demonstration.""" + +from typing import Union + + +class Calculator: + """A simple calculator with basic arithmetic operations.""" + + def __init__(self): + """Initialize calculator with empty history.""" + self.history = [] + + def add( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: + """Add two numbers.""" + result = a + b + self.history.append(f"{a} + {b} = {result}") + return result + + def subtract( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: + """Subtract second number from first.""" + result = a - b + self.history.append(f"{a} - {b} = {result}") + return result + + def multiply( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: + """Multiply two numbers.""" + result = a * b + self.history.append(f"{a} * {b} = {result}") + return result + + def divide( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: + """Divide first number by second.""" + if b == 0: + raise ValueError("Cannot divide by zero") + result = a / b + self.history.append(f"{a} / {b} = {result}") + return result + + def power( + self, base: Union[int, float], exponent: Union[int, float] + ) -> Union[int, float]: + """Raise base to the power of exponent.""" + result = base**exponent + self.history.append(f"{base} ** {exponent} = {result}") + return result + + def clear_history(self) -> None: + """Clear calculation history.""" + self.history.clear() + + def get_history(self) -> list[str]: + """Get calculation history.""" + return self.history.copy() + + +def factorial(n: int) -> int: + 
"""Calculate factorial of n.""" + if n < 0: + raise ValueError("Factorial is not defined for negative numbers") + if n == 0 or n == 1: + return 1 + return n * factorial(n - 1) + + +def is_prime(n: int) -> bool: + """Check if a number is prime.""" + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + + for i in range(3, int(n**0.5) + 1, 2): + if n % i == 0: + return False + return True diff --git a/qualityflow/examples/toy_lib/string_utils.py b/qualityflow/examples/toy_lib/string_utils.py new file mode 100644 index 00000000..276509ab --- /dev/null +++ b/qualityflow/examples/toy_lib/string_utils.py @@ -0,0 +1,122 @@ +""" +String utility functions for QualityFlow demonstration. +""" + +import re +from typing import List + + +def reverse_string(s: str) -> str: + """Reverse a string.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + return s[::-1] + + +def is_palindrome(s: str, ignore_case: bool = True) -> bool: + """Check if a string is a palindrome.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + + # Clean the string - keep only alphanumeric characters + cleaned = re.sub(r"[^a-zA-Z0-9]", "", s) + + if ignore_case: + cleaned = cleaned.lower() + + return cleaned == cleaned[::-1] + + +def count_words(text: str) -> int: + """Count words in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + if not text.strip(): + return 0 + + words = text.split() + return len(words) + + +def capitalize_words(text: str) -> str: + """Capitalize the first letter of each word.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + return " ".join(word.capitalize() for word in text.split()) + + +def extract_emails(text: str) -> List[str]: + """Extract email addresses from text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" + return re.findall(email_pattern, text) + + +def truncate_string(s: str, max_length: int, suffix: str = "...") -> str: + """Truncate string to maximum length with suffix.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + if not isinstance(max_length, int) or max_length < 0: + raise ValueError("max_length must be a non-negative integer") + + if len(s) <= max_length: + return s + + if max_length <= len(suffix): + return s[:max_length] + + return s[: max_length - len(suffix)] + suffix + + +class TextProcessor: + """Text processing utility class.""" + + def __init__(self, default_encoding: str = "utf-8"): + self.default_encoding = default_encoding + self.processed_count = 0 + + def clean_text(self, text: str, remove_punctuation: bool = False) -> str: + """Clean text by removing extra whitespace and optionally punctuation.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + # Remove extra whitespace + cleaned = " ".join(text.split()) + + if remove_punctuation: + # Remove punctuation except spaces + cleaned = re.sub(r"[^\w\s]", "", cleaned) + + self.processed_count += 1 + return cleaned + + def word_frequency( + self, text: str, ignore_case: bool = True + ) -> dict[str, int]: + """Count word frequency in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + words = text.split() + if ignore_case: + words = [word.lower() for word in words] + + frequency = {} + for word in words: + # Remove punctuation from word + clean_word = re.sub(r"[^\w]", "", word) + 
if clean_word: + frequency[clean_word] = frequency.get(clean_word, 0) + 1 + + return frequency + + def get_stats(self) -> dict[str, int]: + """Get processing statistics.""" + return { + "processed_count": self.processed_count, + "default_encoding": self.default_encoding, + } diff --git a/qualityflow/pipelines/__init__.py b/qualityflow/pipelines/__init__.py new file mode 100644 index 00000000..af93c207 --- /dev/null +++ b/qualityflow/pipelines/__init__.py @@ -0,0 +1,5 @@ +"""QualityFlow pipelines.""" + +from .generate_and_evaluate import generate_and_evaluate + +__all__ = ["generate_and_evaluate"] diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py new file mode 100644 index 00000000..7050b5bd --- /dev/null +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -0,0 +1,60 @@ +""" +QualityFlow experiment pipeline for test generation and evaluation. +""" + +from steps.analyze_code import analyze_code +from steps.fetch_source import fetch_source +from steps.gen_tests_agent import gen_tests_agent +from steps.gen_tests_baseline import gen_tests_baseline +from steps.report import report +from steps.run_tests import run_tests +from steps.select_input import select_input +from zenml import pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline(name="generate_and_evaluate") +def generate_and_evaluate() -> None: + """QualityFlow pipeline for generating and evaluating tests. + + Simple, focused pipeline: + 1. Analyze code to find files needing tests + 2. Generate tests using LLM and baseline approaches + 3. Run tests and measure coverage + 4. Report results for comparison + """ + # Step 1: Resolve source specification + spec = select_input() + + # Step 2: Fetch and materialize workspace + workspace_dir, commit_sha = fetch_source(spec) + + # Step 3: Analyze and select code files + code_summary = analyze_code(workspace_dir, commit_sha, spec) + + # Step 4: Generate tests using LLM agent + agent_tests_dir, test_summary = gen_tests_agent( + workspace_dir, code_summary + ) + + # Step 5: Generate baseline tests (optional) + baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) + + # Step 6: Run agent tests + agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent") + + # Step 7: Run baseline tests (if available) + baseline_results = run_tests( + workspace_dir, baseline_tests_dir, label="baseline" + ) + + # Step 8: Generate comprehensive report (includes evaluation) + report( + workspace_dir, + commit_sha, + test_summary, + agent_results, + baseline_results, + ) diff --git a/qualityflow/prompts/unit_test_strict_v2.jinja b/qualityflow/prompts/unit_test_strict_v2.jinja new file mode 100644 index 00000000..32dd2643 --- /dev/null +++ b/qualityflow/prompts/unit_test_strict_v2.jinja @@ -0,0 +1,99 @@ +# Unit Test Generation Prompt v2.0 (Strict) +# Comprehensive test generation with advanced patterns + +You are a senior Python test engineer with expertise in test-driven development. Generate production-grade unit tests with comprehensive coverage. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Advanced Testing Requirements + +Generate {{ max_tests }} comprehensive tests covering ALL of the following: + +### 1. 
Functional Coverage +- **Happy paths**: Normal operation scenarios +- **Edge cases**: Boundary values, empty collections, extreme inputs +- **Error handling**: Exception paths, invalid states +- **State transitions**: Object lifecycle, state changes + +### 2. Quality Patterns +- **Arrange-Act-Assert** structure +- **Given-When-Then** scenarios +- **Property-based testing** where applicable +- **Parameterized tests** for multiple scenarios + +### 3. Advanced Techniques +- **Mock interactions**: Verify call patterns, not just return values +- **Context managers**: Test resource cleanup +- **Async/await**: If code contains async patterns +- **Thread safety**: If code has concurrency +- **Performance bounds**: Basic timing assertions + +### 4. Security Considerations +- **Input sanitization**: SQL injection, XSS prevention +- **Authorization**: Access control validation +- **Data exposure**: Sensitive information leakage + +## Technical Requirements + +- Use `pytest` with fixtures and parametrization +- Implement proper test isolation +- Include integration test patterns where relevant +- Use `hypothesis` for property-based tests when beneficial +- Mock all external dependencies (filesystem, network, databases) +- Test both success and failure scenarios thoroughly + +## Output Format + +Provide production-ready test code: + +```python +""" +Comprehensive unit tests for {{ file_path }} +Generated by QualityFlow (Strict Mode) +Coverage target: >95% line and branch coverage +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock, call +from hypothesis import given, strategies as st +import tempfile +import os +from contextlib import contextmanager + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase): + """Comprehensive test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures and mock objects.""" + pass + + def tearDown(self): + """Clean up after tests.""" + pass + + # Generated test methods with comprehensive coverage + + @pytest.mark.parametrize("input,expected", [ + # Add parameterized test cases + ]) + def test_parametrized_scenarios(self, input, expected): + """Test multiple scenarios with parameterization.""" + pass + +if __name__ == "__main__": + unittest.main() +``` + +Focus on realistic, maintainable tests that would pass code review in a production environment. \ No newline at end of file diff --git a/qualityflow/prompts/unit_test_v1.jinja b/qualityflow/prompts/unit_test_v1.jinja new file mode 100644 index 00000000..1c1cd444 --- /dev/null +++ b/qualityflow/prompts/unit_test_v1.jinja @@ -0,0 +1,61 @@ +# Unit Test Generation Prompt v1.0 +# Standard test generation for QualityFlow + +You are an expert Python test engineer. Generate comprehensive unit tests for the following code. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Instructions + +Generate {{ max_tests }} high-quality unit tests that cover: +1. **Happy path scenarios** - typical usage patterns +2. **Edge cases** - boundary conditions, empty inputs, None values +3. **Error conditions** - invalid inputs, exceptions +4. 
**Integration points** - mocked dependencies where applicable + +## Requirements + +- Use `pytest` and `unittest.TestCase` patterns +- Include proper docstrings for test methods +- Use `unittest.mock` for external dependencies +- Focus on behavioral testing, not implementation details +- Ensure tests are deterministic and repeatable +- Include setup/teardown if needed + +## Output Format + +Provide only the Python test code with no additional explanation: + +```python +""" +Unit tests for {{ file_path }} +Generated by QualityFlow +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class TestModule(unittest.TestCase): + """Test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures.""" + pass + + # Your generated test methods here + +if __name__ == "__main__": + unittest.main() +``` \ No newline at end of file diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt new file mode 100644 index 00000000..2d3f977e --- /dev/null +++ b/qualityflow/requirements.txt @@ -0,0 +1,22 @@ +# ZenML and Core MLOps +zenml>=0.84.2 + +# Core Python Libraries +pydantic>=2.0.0,<3.0.0 +pyyaml>=6.0,<7.0 +jinja2>=3.0.0,<4.0.0 + +# Testing Framework +pytest>=7.0.0,<8.0.0 +pytest-cov>=4.0.0,<5.0.0 +coverage>=7.0.0,<8.0.0 + +# Code Analysis +# ast is built-in, no need to install + +# Git Integration +gitpython>=3.1.0,<4.0.0 + +# LLM Integration (optional) +openai>=1.0.0,<2.0.0 # for OpenAI provider +anthropic>=0.25.0,<1.0.0 # for Anthropic provider \ No newline at end of file diff --git a/qualityflow/run.py b/qualityflow/run.py new file mode 100644 index 00000000..b4a9c513 --- /dev/null +++ b/qualityflow/run.py @@ -0,0 +1,62 @@ +""" +Entry point for running QualityFlow test generation pipeline. +""" + +from pathlib import Path +from typing import Union + +import click +from pipelines import generate_and_evaluate +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@click.command() +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False), + default=None, + required=False, + help="Path to configuration YAML file. Defaults to configs/experiment.default.yaml", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable pipeline caching and force fresh execution", +) +def main(config: Union[str, None], no_cache: bool): + """Run QualityFlow test generation and coverage analysis pipeline. + + Simple pipeline that generates tests using LLM, runs them, measures coverage, + and compares results against baseline approaches. 
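+
+    Example:
+        python run.py --config configs/experiment.local.yaml --no-cache
+
+    Args:
+        config: Path to a pipeline configuration YAML. Defaults to
+            configs/experiment.default.yaml when omitted.
+        no_cache: If True, disable ZenML caching and force a fresh run.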
+ """ + + try: + project_root = Path(__file__).resolve().parent + default_config = project_root / "configs" / "experiment.default.yaml" + except Exception: + # Fallback to current working directory + default_config = Path.cwd() / "configs" / "experiment.default.yaml" + + chosen_config = config or str(default_config) + + try: + logger.info( + f"Starting QualityFlow pipeline with config: {chosen_config}" + ) + pipeline_instance = generate_and_evaluate.with_options( + config_path=chosen_config, enable_cache=not no_cache + ) + pipeline_instance() + logger.info("QualityFlow pipeline completed successfully!") + + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/qualityflow/steps/__init__.py b/qualityflow/steps/__init__.py new file mode 100644 index 00000000..c6857ec7 --- /dev/null +++ b/qualityflow/steps/__init__.py @@ -0,0 +1,21 @@ +"""QualityFlow pipeline steps.""" + +from .analyze_code import analyze_code +from .evaluate_coverage import evaluate_coverage +from .fetch_source import fetch_source +from .gen_tests_agent import gen_tests_agent +from .gen_tests_baseline import gen_tests_baseline +from .report import report +from .run_tests import run_tests +from .select_input import select_input + +__all__ = [ + "select_input", + "fetch_source", + "analyze_code", + "gen_tests_agent", + "gen_tests_baseline", + "run_tests", + "evaluate_coverage", + "report", +] diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py new file mode 100644 index 00000000..7cc5822c --- /dev/null +++ b/qualityflow/steps/analyze_code.py @@ -0,0 +1,167 @@ +""" +Analyze and select code files for test generation. +""" + +import ast +import glob +import os +from enum import Enum +from pathlib import Path +from typing import Annotated, Dict, List + +from zenml import step +from zenml.logger import get_logger + + +class SelectionStrategy(str, Enum): + """Code file selection strategies.""" + + LOW_COVERAGE = "low_coverage" + CHANGED_FILES = "changed_files" + ALL = "all" + + +logger = get_logger(__name__) + + +@step +def analyze_code( + workspace_dir: Path, + commit_sha: str, + source_spec: Dict[str, str], + strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, + max_files: int = 10, +) -> Annotated[Dict, "code_summary"]: + """ + Analyze workspace and select candidate files for test generation. 
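+
+    Files matching target_glob are parsed with Python's ast module, scored
+    with a lightweight complexity heuristic, and selected according to
+    strategy (up to max_files).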
+ + Args: + workspace_dir: Path to workspace directory + commit_sha: Git commit SHA + source_spec: Source specification containing target_glob and other settings + strategy: File selection strategy + max_files: Maximum number of files to select + + Returns: + Code summary dictionary containing selected files and metadata + """ + # Extract target_glob from source spec + target_glob = source_spec.get("target_glob", "src/**/*.py") + + logger.info( + f"Analyzing code in {workspace_dir} with strategy {strategy} and glob {target_glob}" + ) + + workspace_path = Path(workspace_dir) + + # Find all Python files matching glob pattern + all_files = [] + for pattern in target_glob.split(","): + pattern = pattern.strip() + matched_files = glob.glob( + str(workspace_path / pattern), recursive=True + ) + all_files.extend(matched_files) + + # Make paths relative to workspace + relative_files = [ + os.path.relpath(f, workspace_dir) + for f in all_files + if f.endswith(".py") and os.path.isfile(f) + ] + + logger.info(f"Found {len(relative_files)} Python files") + + # Calculate complexity scores + complexity_scores = {} + valid_files = [] + + for file_path in relative_files: + full_path = workspace_path / file_path + try: + with open(full_path, "r", encoding="utf-8") as f: + content = f.read() + + # Parse AST and calculate basic complexity + tree = ast.parse(content) + complexity = _calculate_complexity(tree) + complexity_scores[file_path] = complexity + valid_files.append(file_path) + + except (SyntaxError, UnicodeDecodeError) as e: + logger.warning(f"Skipping {file_path} due to parsing error: {e}") + continue + + # Select files based on strategy + selected_files = _select_files( + valid_files, complexity_scores, strategy, max_files + ) + + code_summary = { + "selected_files": selected_files, + "total_files": len(valid_files), + "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", + "complexity_scores": {f: complexity_scores[f] for f in selected_files}, + } + + logger.info(f"Selected {len(selected_files)} files: {selected_files}") + + return code_summary + + +def _calculate_complexity(tree: ast.AST) -> float: + """Calculate basic complexity score for an AST.""" + + class ComplexityVisitor(ast.NodeVisitor): + def __init__(self): + self.complexity = 0 + self.functions = 0 + self.classes = 0 + + def visit_FunctionDef(self, node): + self.functions += 1 + self.complexity += 1 + for child in ast.walk(node): + if isinstance(child, (ast.If, ast.For, ast.While, ast.Try)): + self.complexity += 1 + self.generic_visit(node) + + def visit_ClassDef(self, node): + self.classes += 1 + self.complexity += 1 + self.generic_visit(node) + + visitor = ComplexityVisitor() + visitor.visit(tree) + + # Combine metrics into single score + return visitor.complexity + visitor.functions * 0.5 + visitor.classes * 2 + + +def _select_files( + files: List[str], + complexity_scores: Dict[str, float], + strategy: SelectionStrategy, + max_files: int, +) -> List[str]: + """Select files based on strategy.""" + + if strategy == SelectionStrategy.ALL: + return files[:max_files] + + elif strategy == SelectionStrategy.LOW_COVERAGE: + # Prioritize complex files that likely need more tests + sorted_files = sorted( + files, key=lambda f: complexity_scores[f], reverse=True + ) + return sorted_files[:max_files] + + elif strategy == SelectionStrategy.CHANGED_FILES: + # For this demo, just return all files (in real implementation, would use git diff) + logger.warning( + "CHANGED_FILES strategy not fully implemented, 
falling back to ALL" + ) + return files[:max_files] + + else: + raise ValueError(f"Unknown selection strategy: {strategy}") diff --git a/qualityflow/steps/evaluate_coverage.py b/qualityflow/steps/evaluate_coverage.py new file mode 100644 index 00000000..f069bfb5 --- /dev/null +++ b/qualityflow/steps/evaluate_coverage.py @@ -0,0 +1,87 @@ +""" +Evaluate coverage metrics and compare against baselines. +""" + +from typing import Annotated, Dict, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def evaluate_coverage( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Annotated[Dict, "evaluation_metrics"]: + """ + Evaluate coverage metrics and compare agent vs baseline approaches. + + Args: + agent_results: Test results from agent-generated tests + baseline_results: Test results from baseline tests (optional) + commit_sha: Current commit SHA + + Returns: + Evaluation metrics dictionary with coverage comparison + """ + logger.info("Evaluating coverage metrics and computing deltas") + + # Extract agent metrics + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + + # Extract baseline metrics + coverage_total_baseline = None + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = 0.0 + if coverage_total_baseline is not None: + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + + evaluation_metrics = { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + logger.info( + f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, improvement={coverage_improvement:+.2f}%" + ) + + return evaluation_metrics diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py new file mode 100644 index 00000000..cdf37548 --- /dev/null +++ b/qualityflow/steps/fetch_source.py @@ -0,0 +1,129 @@ +""" +Fetch source code workspace step. + +This module provides functionality to clone Git repositories and prepare +workspaces for code analysis and test generation. 
+""" + +import subprocess +import tempfile +from pathlib import Path +from typing import Annotated, Dict, Tuple + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def fetch_source( + source_spec: Dict[str, str], +) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: + """ + Fetch and materialize workspace from git repository or use local examples. + + Args: + source_spec: Source specification from select_input step + + Returns: + Tuple of workspace directory path and commit SHA + """ + repo_url = source_spec["repo_url"] + ref = source_spec["ref"] + + # Handle local examples case + if repo_url == "local": + logger.info("Using local QualityFlow examples") + try: + # Get the project root (QualityFlow directory) + current_file = Path(__file__).resolve() + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root + + # Create temporary workspace and copy examples + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_local_workspace_" + ) + workspace_path = Path(workspace_dir) + + # Copy examples directory to the temporary workspace + import shutil + + examples_src = project_root / "examples" + examples_dest = workspace_path / "examples" + + if examples_src.exists(): + shutil.copytree(examples_src, examples_dest) + logger.info( + f"Copied examples from {examples_src} to {examples_dest}" + ) + else: + logger.warning( + f"Examples directory not found at {examples_src}" + ) + + commit_sha = "local-examples" + logger.info(f"Local workspace ready at {workspace_path}") + return workspace_path, commit_sha + + except Exception as e: + logger.error(f"Failed to set up local workspace: {e}") + # Fallback to current working directory + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_fallback_workspace_" + ) + return Path(workspace_dir), "local-fallback" + + logger.info(f"Fetching source from {repo_url}@{ref}") + + # Create temporary workspace for remote repositories + workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") + workspace_path = Path(workspace_dir) + + try: + # Clone repository + logger.info(f"Cloning {repo_url} to {workspace_dir}") + subprocess.run( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + ref, + repo_url, + workspace_dir, + ], + check=True, + capture_output=True, + text=True, + ) + + # Get commit SHA + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=workspace_dir, + check=True, + capture_output=True, + text=True, + ) + commit_sha = result.stdout.strip() + + logger.info( + f"Workspace ready at {workspace_dir}, commit: {commit_sha}" + ) + + return Path(workspace_dir), commit_sha + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to fetch source: {e}") + raise RuntimeError(f"Git operation failed: {e.stderr}") + except Exception as e: + logger.error(f"Unexpected error fetching source: {e}") + # Clean up on error + import shutil + + shutil.rmtree(workspace_dir, ignore_errors=True) + raise diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py new file mode 100644 index 00000000..de879731 --- /dev/null +++ b/qualityflow/steps/gen_tests_agent.py @@ -0,0 +1,521 @@ +""" +Generate tests using LLM agent. 
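+
+Tests are rendered from Jinja2 prompt templates and generated with the OpenAI
+or Anthropic APIs, or with a "fake" provider that returns deterministic
+placeholder tests for offline runs.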
+""" + +import tempfile +from enum import Enum +from pathlib import Path +from typing import Annotated, Dict, Tuple + +from jinja2 import Template +from zenml import log_metadata, step +from zenml.logger import get_logger +from zenml.types import MarkdownString + + +class GenerationProvider(str, Enum): + """LLM providers for test generation.""" + + OPENAI = "openai" + ANTHROPIC = "anthropic" + FAKE = "fake" + + +logger = get_logger(__name__) + + +@step +def gen_tests_agent( + workspace_dir: Path, + code_summary: Dict, + provider: GenerationProvider = GenerationProvider.FAKE, + model: str = "gpt-4o-mini", + prompt_path: str = "prompts/unit_test_v1.jinja", + max_tests_per_file: int = 3, + max_files: int = 10, +) -> Tuple[ + Annotated[Path, "agent_tests_dir"], + Annotated[MarkdownString, "test_summary"], +]: + """Generate tests using LLM agent. + + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + provider: LLM provider to use + model: Model name + prompt_path: Path to Jinja2 prompt template + max_tests_per_file: Maximum tests to generate per file + max_files: Maximum number of files to process (for speed control) + + Returns: + Tuple of test directory and test generation summary + """ + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}" + ) + + # Create tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") + tests_path = Path(tests_dir) + + # Load prompt template from QualityFlow project directory + # Note: workspace_dir is the cloned repo, but prompts are in QualityFlow project + try: + # Try to resolve project root more robustly + current_file = Path(__file__).resolve() + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root + prompt_file = project_root / prompt_path + except Exception: + # Fallback to current working directory if path resolution fails + prompt_file = Path.cwd() / prompt_path + + if prompt_file.exists(): + with open(prompt_file, "r") as f: + prompt_template = f.read() + logger.info(f"Loaded prompt template from {prompt_file}") + else: + # Use default template if file doesn't exist + prompt_template = _get_default_prompt_template() + logger.info( + f"Using default prompt template, {prompt_path} not found at {prompt_file}" + ) + + template = Template(prompt_template) + + # Keep workspace_path for reading source files + workspace_path = Path(workspace_dir) + + total_tokens_in = 0 + total_tokens_out = 0 + test_snippets = {} # Store test snippets per file + test_stats = {} # Store test statistics per file + + for file_path in files_to_process: + logger.info(f"Generating tests for {file_path}") + + # Read source file + full_file_path = workspace_path / file_path + with open(full_file_path, "r") as f: + source_code = f.read() + + # Render prompt + materialized_prompt = template.render( + file_path=file_path, + source_code=source_code, + max_tests=max_tests_per_file, + complexity_score=code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), + ) + + # Store test generation info for this file + test_stats[file_path] = { + "provider": provider.value, + "model": model, + "max_tests": max_tests_per_file, + "complexity_score": 
code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), + } + + # Generate tests using provider + if provider == GenerationProvider.FAKE: + generated_tests, tokens = _generate_fake_tests( + file_path, source_code, max_tests_per_file + ) + elif provider == GenerationProvider.OPENAI: + generated_tests, tokens = _generate_openai_tests( + materialized_prompt, model + ) + elif provider == GenerationProvider.ANTHROPIC: + generated_tests, tokens = _generate_anthropic_tests( + materialized_prompt, model + ) + else: + raise ValueError(f"Unsupported provider: {provider}") + + total_tokens_in += tokens.get("tokens_in", 0) + total_tokens_out += tokens.get("tokens_out", 0) + + # Save generated tests + test_file_name = f"test_{Path(file_path).stem}.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, "w") as f: + f.write(generated_tests) + + # Store test snippet for summary (first 20 lines) + test_lines = generated_tests.split("\n") + snippet_lines = test_lines[:20] + if len(test_lines) > 20: + snippet_lines.append("... (truncated)") + test_snippets[file_path] = "\n".join(snippet_lines) + + # Update test stats with actual counts + test_stats[file_path]["lines_generated"] = len(test_lines) + test_stats[file_path]["test_functions"] = len( + [ + line + for line in test_lines + if line.strip().startswith("def test_") + ] + ) + + logger.info(f"Generated tests saved to {test_file_path}") + + # Log comprehensive metadata + metadata = { + "token_usage": { + "tokens_in": total_tokens_in, + "tokens_out": total_tokens_out, + "cost_estimate": _estimate_cost( + total_tokens_in, total_tokens_out, provider, model + ), + }, + "config": { + "provider": provider.value, + "model": model, + "prompt_template_path": prompt_path, + "max_tests_per_file": max_tests_per_file, + "files_processed": len(files_to_process), + }, + "test_stats": test_stats, + } + + log_metadata(metadata) + logger.info( + f"Test generation complete. 
Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out" + ) + + # Create test generation summary + test_summary = _create_test_summary( + provider, + model, + prompt_path, + files_to_process, + test_snippets, + test_stats, + total_tokens_in, + total_tokens_out, + ) + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir), test_summary + + +def _create_test_summary( + provider: GenerationProvider, + model: str, + prompt_path: str, + files_processed: list, + test_snippets: Dict[str, str], + test_stats: Dict[str, Dict], + total_tokens_in: int, + total_tokens_out: int, +) -> MarkdownString: + """Create a markdown summary of test generation results.""" + + # Calculate totals + total_lines = sum( + stats.get("lines_generated", 0) for stats in test_stats.values() + ) + total_test_functions = sum( + stats.get("test_functions", 0) for stats in test_stats.values() + ) + + # Handle edge case of no files processed + if len(files_processed) == 0: + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: 0 + +## Generation Statistics +โš ๏ธ **No files were processed for test generation.** + +This could happen if: +- No files matched the target glob pattern +- All files were filtered out during analysis +- Max files limit was set to 0 + +**Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out +""" + return MarkdownString(summary) + + # Build markdown content for successful processing + avg_tests = total_test_functions / len(files_processed) + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: {len(files_processed)} + +## Generation Statistics +- **Total Lines Generated**: {total_lines:,} +- **Total Test Functions**: {total_test_functions} +- **Average Tests per File**: {avg_tests:.1f} +- **Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out + +## Generated Tests by File + +""" + + for file_path in files_processed: + stats = test_stats.get(file_path, {}) + snippet = test_snippets.get(file_path, "") + + complexity = stats.get("complexity_score", 0) + lines = stats.get("lines_generated", 0) + test_count = stats.get("test_functions", 0) + + summary += f"""### ๐Ÿ“„ `{file_path}` +**Complexity Score**: {complexity:.1f} | **Lines**: {lines} | **Test Functions**: {test_count} + +``` +{snippet} +``` + +--- + +""" + + return MarkdownString(summary) + + +def _get_default_prompt_template() -> str: + """Default Jinja2 prompt template for test generation.""" + return """# Generate unit tests for the following Python code + +File: {{ file_path }} +Complexity Score: {{ complexity_score }} +Max Tests: {{ max_tests }} + +## Source Code: +```python +{{ source_code }} +``` + +## Instructions: +Generate {{ max_tests }} comprehensive unit tests for the functions and classes in this code. +Focus on edge cases, error conditions, and typical usage patterns. 
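+Use pytest conventions, mock external dependencies, and return only the test code.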
+ +## Generated Tests: +""" + + +def _generate_fake_tests( + file_path: str, source_code: str, max_tests: int +) -> Tuple[str, Dict]: + """Generate fake/mock tests for development/testing.""" + test_content = f'''""" +Generated tests for {file_path} +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase): + """Auto-generated test class for {file_path}.""" + + def test_module_import(self): + """Test that we can at least validate the test framework.""" + # Simple test that always passes to ensure test discovery works + self.assertTrue(True) + + def test_basic_functionality(self): + """Test basic functionality.""" + # Mock test demonstrating test execution + result = 1 + 1 + self.assertEqual(result, 2) + + def test_error_handling(self): + """Test error handling.""" + # Test exception handling + with self.assertRaises(ValueError): + raise ValueError("Expected test exception") + + def test_mock_usage(self): + """Test mock functionality.""" + # Test using mocks + mock_obj = Mock() + mock_obj.method.return_value = "mocked_result" + result = mock_obj.method() + self.assertEqual(result, "mocked_result") + + def test_coverage_target(self): + """Test that generates some coverage.""" + # Simple operations to generate coverage + data = {{"key": "value"}} + self.assertIn("key", data) + + items = [1, 2, 3, 4, 5] + filtered = [x for x in items if x > 3] + self.assertEqual(len(filtered), 2) + +if __name__ == "__main__": + unittest.main() +''' + + tokens = {"tokens_in": 100, "tokens_out": 50} + return test_content, tokens + + +def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using OpenAI API.""" + try: + import os + + import openai + + # Get API key from environment + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + logger.warning("OPENAI_API_KEY not found, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + client = openai.OpenAI(api_key=api_key) + + # Call OpenAI API + response = client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.", + }, + {"role": "user", "content": prompt}, + ], + max_tokens=2000, + temperature=0.1, + ) + + # Extract test code from response + generated_content = response.choices[0].message.content + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.prompt_tokens, + "tokens_out": response.usage.completion_tokens, + } + + logger.info( + f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) + return test_content, tokens + + except ImportError: + logger.warning("OpenAI library not installed, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with OpenAI: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + +def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using Anthropic API.""" + try: + import os + + import anthropic + + # Get API key from environment + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + logger.warning("ANTHROPIC_API_KEY not found, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + client = anthropic.Anthropic(api_key=api_key) + + # Call Anthropic API + response = client.messages.create( + model=model, + max_tokens=2000, + temperature=0.1, + messages=[ + { + "role": "user", + "content": f"You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.\n\n{prompt}", + } + ], + ) + + # Extract test content from response + generated_content = response.content[0].text + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.input_tokens, + "tokens_out": response.usage.output_tokens, + } + + logger.info( + f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) + return test_content, tokens + + except ImportError: + logger.warning("Anthropic library not installed, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with Anthropic: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + +def _estimate_cost( + tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str +) -> float: + """Estimate cost based on token usage.""" + # Rough cost estimates (would need real pricing) + if provider == GenerationProvider.OPENAI: + if "gpt-4" in model: + return (tokens_in * 0.00003) + (tokens_out * 0.00006) + else: # gpt-3.5 + return (tokens_in * 0.0000015) + (tokens_out * 0.000002) + elif provider == GenerationProvider.ANTHROPIC: + return (tokens_in * 0.000008) + (tokens_out * 0.000024) + else: + return 0.0 diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py new file mode 100644 index 00000000..db712e81 --- /dev/null +++ b/qualityflow/steps/gen_tests_baseline.py @@ -0,0 +1,204 @@ +""" +Generate baseline/skeleton tests using heuristics. + +This module creates simple test templates by analyzing Python AST to identify +functions and classes, generating skeleton test code for comparison with +LLM-generated tests. +""" + +import ast +import tempfile +from pathlib import Path +from typing import Annotated, Dict, List, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def gen_tests_baseline( + workspace_dir: Path, + code_summary: Dict, + enabled: bool = True, + max_files: int = 10, +) -> Annotated[Optional[Path], "baseline_tests_dir"]: + """ + Generate baseline/skeleton tests using heuristic analysis. 
+ + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + enabled: Whether baseline generation is enabled + max_files: Maximum number of files to process + + Returns: + Path to baseline tests directory, or None if disabled + """ + if not enabled: + logger.info("Baseline test generation disabled") + return None + + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files" + ) + + # Create baseline tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_baseline_tests_") + tests_path = Path(tests_dir) + + workspace_path = Path(workspace_dir) + + for file_path in files_to_process: + logger.info(f"Generating baseline tests for {file_path}") + + # Read and parse source file + full_file_path = workspace_path / file_path + with open(full_file_path, "r") as f: + source_code = f.read() + + try: + tree = ast.parse(source_code) + + # Extract functions and classes + functions, classes = _extract_testable_items(tree) + + # Generate skeleton tests + test_content = _generate_skeleton_tests( + file_path, functions, classes + ) + + # Save baseline tests + test_file_name = f"test_{Path(file_path).stem}_baseline.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, "w") as f: + f.write(test_content) + + logger.info(f"Baseline tests saved to {test_file_path}") + + except SyntaxError as e: + logger.warning(f"Skipping {file_path} due to syntax error: {e}") + continue + + logger.info("Baseline test generation complete") + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir) + + +def _extract_testable_items(tree: ast.AST) -> tuple[List[str], List[str]]: + """Extract function and class names from AST.""" + functions = [] + classes = [] + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + # Skip private functions (starting with _) + if not node.name.startswith("_"): + functions.append(node.name) + elif isinstance(node, ast.ClassDef): + # Skip private classes + if not node.name.startswith("_"): + classes.append(node.name) + + return functions, classes + + +def _generate_skeleton_tests( + file_path: str, functions: List[str], classes: List[str] +) -> str: + """Generate skeleton test content.""" + + # Create imports section + imports = f'''""" +Baseline/skeleton tests for {file_path} +Generated using heuristic analysis. +""" + +import pytest +import unittest +from unittest.mock import Mock, patch +''' + + # Try to determine import path from file path + module_path = file_path.replace("/", ".").replace(".py", "") + if module_path.startswith("src."): + module_path = module_path[4:] # Remove 'src.' 
prefix + + if functions or classes: + imports += ( + f"# from {module_path} import {', '.join(functions + classes)}\n\n" + ) + else: + imports += f"# from {module_path} import *\n\n" + + # Generate function tests + function_tests = "" + for func_name in functions: + function_tests += f''' +def test_{func_name}_basic(): + """Basic test for {func_name}.""" + # TODO: Implement test for {func_name} + pass + +def test_{func_name}_error_cases(): + """Error case test for {func_name}.""" + # TODO: Test error conditions for {func_name} + pass +''' + + # Generate class tests + class_tests = "" + for class_name in classes: + class_tests += f''' +class Test{class_name}(unittest.TestCase): + """Test suite for {class_name}.""" + + def setUp(self): + """Set up test fixtures.""" + # TODO: Initialize test fixtures + pass + + def test_{class_name.lower()}_init(self): + """Test {class_name} initialization.""" + # TODO: Test class initialization + pass + + def test_{class_name.lower()}_methods(self): + """Test {class_name} methods.""" + # TODO: Test class methods + pass +''' + + # Add default test if no functions or classes found + if not functions and not classes: + default_test = ''' +class TestModule(unittest.TestCase): + """Default test suite for module.""" + + def test_module_imports(self): + """Test that module can be imported.""" + # TODO: Add import test + pass +''' + class_tests += default_test + + # Combine all parts + test_content = imports + function_tests + class_tests + + # Add main block + test_content += """ +if __name__ == "__main__": + unittest.main() +""" + + return test_content diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py new file mode 100644 index 00000000..ab8e564d --- /dev/null +++ b/qualityflow/steps/report.py @@ -0,0 +1,261 @@ +""" +Generate comprehensive pipeline report. + +This module creates detailed markdown reports comparing LLM-generated tests +against baseline tests, including coverage metrics, quality assessments, +and recommendations for improvement. +""" + +import tempfile +from datetime import datetime +from pathlib import Path +from typing import Annotated, Dict, Optional + +from zenml import step +from zenml.logger import get_logger +from zenml.types import MarkdownString + +logger = get_logger(__name__) + + +@step +def report( + workspace_dir: Path, + commit_sha: str, + test_summary: MarkdownString, + agent_results: Dict, + baseline_results: Optional[Dict], +) -> Annotated[MarkdownString, "final_report"]: + """ + Generate comprehensive markdown report for pipeline execution. 
+ + Args: + workspace_dir: Workspace directory path + commit_sha: Git commit SHA + test_summary: Test generation summary with snippets + agent_results: Agent test results + baseline_results: Baseline test results (optional) + + Returns: + Markdown report as string + """ + logger.info("Generating pipeline execution report") + + # Create report file + report_file = ( + Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" + ) + + # Evaluate coverage metrics first + evaluation_metrics = _evaluate_coverage_metrics( + agent_results, baseline_results, commit_sha + ) + + # Generate report content + report_content = _generate_report_content( + workspace_dir, + commit_sha, + test_summary, + agent_results, + baseline_results, + evaluation_metrics, + ) + + # Write report file + with open(report_file, "w") as f: + f.write(report_content) + + logger.info(f"Report generated: {report_file}") + + # Return as MarkdownString for dashboard visualization + return MarkdownString(report_content) + + +def _evaluate_coverage_metrics( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Dict: + """Evaluate coverage metrics and compare agent vs baseline approaches.""" + + # Extract agent metrics - use actual values from test results + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + + # Extract baseline metrics + coverage_total_baseline = 0.0 + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + + return { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + +def _generate_report_content( + workspace_dir: Path, + commit_sha: str, + test_summary: MarkdownString, + agent_results: Dict, + baseline_results: Optional[Dict], + evaluation_metrics: Dict, +) -> str: + """Generate markdown report content.""" + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # Header + report = f"""# QualityFlow Pipeline Report + +Generated: {timestamp} +Commit: `{commit_sha}` +Workspace: `{workspace_dir}` + +## Executive Summary + +""" + + # Executive summary + coverage_agent = evaluation_metrics.get("coverage_total_agent", 0.0) + coverage_baseline = evaluation_metrics.get("coverage_total_baseline", 0.0) + improvement = evaluation_metrics.get("coverage_improvement", 0.0) + quality = evaluation_metrics.get("coverage_quality", "unknown") + + quality_emoji = ( + "๐ŸŸข" if quality 
== "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" + ) + improvement_emoji = ( + "๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" + ) + + report += f"""{quality_emoji} **Coverage Quality**: {quality.upper()} +{improvement_emoji} **Agent vs Baseline**: {coverage_agent:.2f}% vs {coverage_baseline:.2f}% ({improvement:+.2f}%) +๐Ÿงช **Tests**: {agent_results.get("tests_passed", 0)} passed, {agent_results.get("tests_failed", 0)} failed +๐Ÿ“ **Files**: {evaluation_metrics.get("files_analyzed", 0)} analyzed + +""" + + # Agent results section + report += """## Agent Test Results + +""" + + if agent_results.get("skipped", False): + report += "Agent tests were skipped.\n\n" + else: + report += f"""- **Tests Passed**: {agent_results.get("tests_passed", 0)} +- **Tests Failed**: {agent_results.get("tests_failed", 0)} +- **Pass Rate**: {evaluation_metrics.get("pass_rate_agent", 0.0):.1%} +- **Coverage**: {agent_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{agent_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{agent_results.get("coverage_path", "N/A")}` +- **Logs**: `{agent_results.get("logs_path", "N/A")}` + +""" + + # Baseline results section (if available) + if baseline_results and not baseline_results.get("skipped", False): + report += """## Baseline Test Results + +""" + report += f"""- **Tests Passed**: {baseline_results.get("tests_passed", 0)} +- **Tests Failed**: {baseline_results.get("tests_failed", 0)} +- **Coverage**: {baseline_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{baseline_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{baseline_results.get("coverage_path", "N/A")}` + +""" + + # Evaluation metrics section + report += """## Coverage Analysis + +""" + + pass_rate = evaluation_metrics.get("pass_rate_agent", 0.0) + pass_quality = evaluation_metrics.get("pass_rate_quality", "unknown") + + report += f"""- **Agent Coverage**: {coverage_agent:.2f}% ({quality}) +- **Baseline Coverage**: {coverage_baseline:.2f}% +- **Improvement**: {improvement:+.2f}% +- **Test Pass Rate**: {pass_rate:.1%} ({pass_quality}) +- **Files Analyzed**: {evaluation_metrics.get("files_analyzed", 0)} + +""" + + # Recommendations section + report += """## Recommendations + +""" + if quality == "excellent": + report += "๐ŸŽ‰ **Excellent coverage!** Consider this approach for production use.\n" + elif quality == "good": + report += "๐Ÿ‘ **Good coverage.** Consider tweaking prompts or selection strategy for improvement.\n" + else: + report += "โš ๏ธ **Coverage needs improvement.** Try different prompts, models, or increase max_tests_per_file.\n" + + if improvement > 5: + report += "๐Ÿ“ˆ **Agent significantly outperforms baseline** - LLM approach is working well.\n" + elif improvement > 0: + report += "๐Ÿ“Š **Agent slightly better than baseline** - room for optimization.\n" + else: + report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" + + # Test generation details section + report += f"""## Test Generation Details + +{test_summary} + +### File Coverage Details +""" + + coverage_by_file = agent_results.get("coverage_by_file", {}) + if coverage_by_file: + report += "| File | Coverage |\n|------|----------|\n" + for file_path, coverage_pct in sorted(coverage_by_file.items()): + report += f"| `{file_path}` | {coverage_pct:.1f}% |\n" + else: + report += "No file-level coverage data available.\n" + + report += """ + +--- +*Generated by QualityFlow - Production-ready test generation with ZenML* +""" + + 
return report diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py new file mode 100644 index 00000000..4ad3edc9 --- /dev/null +++ b/qualityflow/steps/run_tests.py @@ -0,0 +1,278 @@ +""" +Run tests and collect coverage metrics. +""" + +import shutil +import subprocess +import tempfile +from pathlib import Path +from typing import Annotated, Dict, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def run_tests( + workspace_dir: Path, + tests_dir: Optional[Path], + label: str = "tests", +) -> Annotated[Dict, "test_results"]: + """Run tests and collect coverage metrics. + + Args: + workspace_dir: Path to workspace directory + tests_dir: Path object to tests directory (None if no tests) + label: Label for this test run + + Returns: + Dictionary containing test results and metrics + """ + if tests_dir is None: + logger.info(f"No tests directory provided for {label}, skipping") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 0, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": None, + "coverage_path": None, + "logs_path": None, + "skipped": True, + } + + logger.info(f"Running {label} tests from {tests_dir}") + + # Create output directory for this test run + output_dir = tempfile.mkdtemp(prefix=f"qualityflow_{label}_results_") + output_path = Path(output_dir) + + junit_file = output_path / "junit.xml" + coverage_file = output_path / "coverage.xml" + logs_file = output_path / "test_logs.txt" + + # Copy tests to workspace (pytest needs them in PYTHONPATH) + workspace_tests_dir = Path(workspace_dir) / f"tests_{label}" + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir) + shutil.copytree(tests_dir, workspace_tests_dir) + + try: + # Run pytest with coverage + pytest_cmd = [ + "python", + "-m", + "pytest", + str(workspace_tests_dir), + "--junitxml", + str(junit_file), + "--cov", + str(workspace_dir), + "--cov-report", + f"xml:{coverage_file}", + "--cov-report", + "term", + "-v", + ] + + logger.info(f"Running command: {' '.join(pytest_cmd)}") + logger.info(f"Working directory: {workspace_dir}") + logger.info(f"Test directory: {workspace_tests_dir}") + + # Debug: list test files + if workspace_tests_dir.exists(): + test_files = list(workspace_tests_dir.glob("*.py")) + logger.info(f"Test files found: {[f.name for f in test_files]}") + else: + logger.warning( + f"Test directory does not exist: {workspace_tests_dir}" + ) + + result = subprocess.run( + pytest_cmd, + cwd=str(workspace_dir), + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + # Save logs and also log to console for debugging + with open(logs_file, "w") as f: + f.write(f"Command: {' '.join(pytest_cmd)}\n") + f.write(f"Return code: {result.returncode}\n\n") + f.write("STDOUT:\n") + f.write(result.stdout) + f.write("\nSTDERR:\n") + f.write(result.stderr) + + # Also log the pytest output for debugging + logger.info(f"Pytest return code: {result.returncode}") + if result.stdout: + logger.info(f"Pytest stdout: {result.stdout}") + if result.stderr: + logger.info(f"Pytest stderr: {result.stderr}") + + # Parse results + test_results = _parse_test_results( + result, junit_file, coverage_file, logs_file, label + ) + + logger.info( + f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage" + ) + + return test_results + + except subprocess.TimeoutExpired: + logger.error(f"Test run for 
{label} timed out after 5 minutes") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, + "logs_path": str(logs_file), + "error": "Test execution timed out", + } + + except Exception as e: + logger.error(f"Failed to run tests for {label}: {e}") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, + "logs_path": str(logs_file) if logs_file.exists() else None, + "error": str(e), + } + + finally: + # Clean up copied tests + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir, ignore_errors=True) + + +def _parse_test_results( + result: subprocess.CompletedProcess, + junit_file: Path, + coverage_file: Path, + logs_file: Path, + label: str, +) -> Dict: + """Parse test execution results.""" + + # Parse pytest output for basic stats + tests_passed = 0 + tests_failed = 0 + + if result.stdout: + lines = result.stdout.split("\n") + for line in lines: + if " passed" in line and " failed" in line: + # Line like "2 failed, 3 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + elif part == "failed" and i > 0: + tests_failed = int(parts[i - 1]) + elif " passed" in line and "failed" not in line: + # Line like "5 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + + # Parse coverage from XML if available + coverage_total = 0.0 + coverage_by_file = {} + + if coverage_file.exists(): + coverage_total, coverage_by_file = _parse_coverage_xml(coverage_file) + + return { + "label": label, + "tests_passed": tests_passed, + "tests_failed": tests_failed, + "coverage_total": coverage_total, + "coverage_by_file": coverage_by_file, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, + "logs_path": str(logs_file), + "return_code": result.returncode, + } + + +def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: + """Parse coverage XML file.""" + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(coverage_file) + root = tree.getroot() + + # Debug: log the XML structure + logger.info(f"Coverage XML root tag: {root.tag}") + logger.info(f"Coverage XML root attribs: {root.attrib}") + + # Get overall coverage - try different formats + coverage_total = 0.0 + + # Modern pytest-cov uses 'coverage' as root element + if root.tag == "coverage": + line_rate = root.get("line-rate", "0") + if line_rate != "0": + coverage_total = float(line_rate) * 100 + logger.info(f"Found line-rate in coverage root: {line_rate}") + else: + # Try finding coverage element nested + coverage_element = root.find(".//coverage") + if coverage_element is not None: + line_rate = coverage_element.get("line-rate", "0") + coverage_total = float(line_rate) * 100 + logger.info( + f"Found coverage element with line-rate: {line_rate}" + ) + + # If still no coverage found, try branches-valid attribute (alternative format) + if coverage_total == 0.0: + lines_valid = root.get("lines-valid", "0") + lines_covered = 
root.get("lines-covered", "0") + + if lines_valid != "0": + line_coverage = float(lines_covered) / float(lines_valid) + coverage_total = line_coverage * 100 + logger.info( + f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%" + ) + + # Get per-file coverage + coverage_by_file = {} + for class_elem in root.findall(".//class"): + filename = class_elem.get("filename", "") + line_rate = class_elem.get("line-rate", "0") + if filename: + coverage_by_file[filename] = float(line_rate) * 100 + + logger.info( + f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files" + ) + return coverage_total, coverage_by_file + + except Exception as e: + logger.warning(f"Failed to parse coverage XML: {e}") + return 0.0, {} diff --git a/qualityflow/steps/select_input.py b/qualityflow/steps/select_input.py new file mode 100644 index 00000000..ff16e391 --- /dev/null +++ b/qualityflow/steps/select_input.py @@ -0,0 +1,39 @@ +""" +Select input source specification step. +""" + +from typing import Annotated, Dict + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def select_input( + repo_url: str = "https://github.com/psf/requests.git", + ref: str = "main", + target_glob: str = "src/**/*.py", +) -> Annotated[Dict[str, str], "source_spec"]: + """ + Resolve source specification for test generation. + + Args: + repo_url: Repository URL to analyze + ref: Git reference (branch, tag, commit) + target_glob: Glob pattern for target files + + Returns: + Source specification dictionary + """ + logger.info(f"Selecting input source: {repo_url}@{ref}") + + spec = { + "repo_url": repo_url, + "ref": ref, + "target_glob": target_glob, + } + + logger.info(f"Source spec: {spec}") + return spec