From e0fac67e2c3e46d0c39203f3912887af2597d3bc Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:29:49 +0200 Subject: [PATCH 1/8] Added new projecg --- qualityflow/README.md | 358 ++++++++++++++++++ qualityflow/configs/experiment.default.yaml | 41 ++ qualityflow/configs/experiment.strict.yaml | 42 ++ qualityflow/examples/toy_lib/__init__.py | 5 + qualityflow/examples/toy_lib/calculator.py | 75 ++++ qualityflow/examples/toy_lib/string_utils.py | 120 ++++++ qualityflow/pipelines/__init__.py | 5 + .../pipelines/generate_and_evaluate.py | 63 +++ qualityflow/prompts/unit_test_strict_v2.jinja | 99 +++++ qualityflow/prompts/unit_test_v1.jinja | 61 +++ qualityflow/requirements.txt | 22 ++ qualityflow/run.py | 55 +++ qualityflow/steps/__init__.py | 21 + qualityflow/steps/analyze_code.py | 151 ++++++++ qualityflow/steps/evaluate_coverage.py | 68 ++++ qualityflow/steps/fetch_source.py | 70 ++++ qualityflow/steps/gen_tests_agent.py | 358 ++++++++++++++++++ qualityflow/steps/gen_tests_baseline.py | 190 ++++++++++ qualityflow/steps/report.py | 238 ++++++++++++ qualityflow/steps/run_tests.py | 258 +++++++++++++ qualityflow/steps/select_input.py | 38 ++ 21 files changed, 2338 insertions(+) create mode 100644 qualityflow/README.md create mode 100644 qualityflow/configs/experiment.default.yaml create mode 100644 qualityflow/configs/experiment.strict.yaml create mode 100644 qualityflow/examples/toy_lib/__init__.py create mode 100644 qualityflow/examples/toy_lib/calculator.py create mode 100644 qualityflow/examples/toy_lib/string_utils.py create mode 100644 qualityflow/pipelines/__init__.py create mode 100644 qualityflow/pipelines/generate_and_evaluate.py create mode 100644 qualityflow/prompts/unit_test_strict_v2.jinja create mode 100644 qualityflow/prompts/unit_test_v1.jinja create mode 100644 qualityflow/requirements.txt create mode 100644 qualityflow/run.py create mode 100644 qualityflow/steps/__init__.py create mode 100644 qualityflow/steps/analyze_code.py create mode 100644 qualityflow/steps/evaluate_coverage.py create mode 100644 qualityflow/steps/fetch_source.py create mode 100644 qualityflow/steps/gen_tests_agent.py create mode 100644 qualityflow/steps/gen_tests_baseline.py create mode 100644 qualityflow/steps/report.py create mode 100644 qualityflow/steps/run_tests.py create mode 100644 qualityflow/steps/select_input.py diff --git a/qualityflow/README.md b/qualityflow/README.md new file mode 100644 index 00000000..490e9aa2 --- /dev/null +++ b/qualityflow/README.md @@ -0,0 +1,358 @@ +# ๐Ÿงช QualityFlow: AI-Powered Test Generation Pipeline + +A streamlined MLOps pipeline for **automated test generation** using ZenML and LLMs. Generate comprehensive unit tests for your codebase, compare different approaches, and get detailed coverage analysis. + +## ๐Ÿš€ Product Overview + +QualityFlow demonstrates how to build production-ready MLOps workflows for automated test generation using Large Language Models. Built with ZenML, it provides a simple yet powerful pipeline for generating and evaluating AI-generated tests. + +**Focus**: **LLM-Powered Test Generation** and **Coverage Analysis**. 
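+
+Under the hood, QualityFlow is a single ZenML pipeline. Here is a condensed view of `pipelines/generate_and_evaluate.py` from this repository (docstrings and logging setup omitted):
+
+```python
+from zenml import pipeline
+
+from steps.select_input import select_input
+from steps.fetch_source import fetch_source
+from steps.analyze_code import analyze_code
+from steps.gen_tests_agent import gen_tests_agent
+from steps.gen_tests_baseline import gen_tests_baseline
+from steps.run_tests import run_tests
+from steps.report import report
+
+
+@pipeline(name="generate_and_evaluate")
+def generate_and_evaluate() -> None:
+    spec = select_input()                                    # resolve source specification
+    workspace_dir, commit_sha = fetch_source(spec)           # clone repo, materialize workspace
+    code_summary = analyze_code(workspace_dir, commit_sha)   # select candidate files
+    agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary)
+    baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary)
+    agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent")
+    baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline")
+    report(workspace_dir, commit_sha, prompt_used, agent_results, baseline_results)
+```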
+ +### Key Features + +- **Real LLM Integration**: OpenAI and Anthropic providers for intelligent test generation +- **Smart File Selection**: Configurable strategies to focus on files that need testing +- **Baseline Comparison**: Compare LLM-generated tests vs heuristic baseline tests +- **Coverage Analysis**: Real coverage metrics with detailed reporting +- **Speed Controls**: `max_files` parameters to control pipeline execution time +- **Containerized Ready**: Uses ZenML Path artifacts for remote execution +- **Cost Tracking**: Token usage and cost estimation with metadata logging + +## ๐Ÿ’ก How It Works + +### โœˆ๏ธ Pipeline Architecture + +QualityFlow consists of a single, focused pipeline: + +#### Generate & Evaluate Pipeline + +The main pipeline handles the complete test generation workflow: + +1. **Source Selection** - Specify repository and target files +2. **Code Fetching** - Clone and materialize workspace +3. **Code Analysis** - Select files for testing (with max_files limit) +4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers +5. **Baseline Generation** - Create simple heuristic tests for comparison +6. **Test Execution** - Run both test suites with coverage analysis +7. **Report Generation** - Compare results and generate markdown reports + +### ๐Ÿ”ง Architecture + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Git Repo โ”‚ โ”‚ LLM Providers โ”‚ โ”‚ Test Reports โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ src/**/*.py โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ OpenAI/Claude โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ Coverage โ”‚ +โ”‚ target files โ”‚ โ”‚ Fake (testing) โ”‚ โ”‚ Comparisons โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ Cost Tracking โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ โ–ฒ + โ–ผ โ”‚ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ QualityFlow Pipeline โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ Generate & Evaluate โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ 1. Select Input โ†’ 2. Fetch Source โ†’ 3. Analyze โ”‚ โ”‚ +โ”‚ โ”‚ 4. Generate (LLM) โ†’ 5. Generate (Base) โ†’ 6. Run Tests โ”‚ โ”‚ +โ”‚ โ”‚ 7. Run Tests โ†’ 8. Report & Compare โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Features: max_files control, Path artifacts, metadata โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +## ๐Ÿ“ฆ Quick Start + +### Prerequisites + +- Python 3.9+ +- ZenML installed (`pip install zenml`) +- Git +- OpenAI API key (optional, can use fake provider) + +### Setup + +```bash +pip install -r requirements.txt +``` + +2. **Set up OpenAI (optional)**: +```bash +export OPENAI_API_KEY="your-api-key-here" +``` + +3. **Run the pipeline**: +```bash +python run.py +``` + +That's it! 
The pipeline will: +- Clone the configured repository (default: requests library) +- Analyze Python files and select candidates +- Generate tests using OpenAI (or fake provider if no API key) +- Run tests and measure coverage +- Generate a comprehensive report comparing approaches + +## โš™๏ธ Configuration + +### Key Parameters + +You can customize the pipeline behavior by editing `configs/experiment.default.yaml`: + +```yaml +# Control execution speed +steps: + analyze_code: + parameters: + max_files: 3 # Limit files to analyze (faster execution) + + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + max_files: 2 # Limit files for test generation + max_tests_per_file: 3 + + gen_tests_baseline: + parameters: + max_files: 2 # Match agent for fair comparison +``` + +### Pipeline Options + +```bash +# Use fake provider (no API key needed) +python run.py # Uses config defaults + +# Force fresh execution (no caching) +python run.py --no-cache + +# Use different config +python run.py --config configs/experiment.strict.yaml +``` + +## ๐Ÿ”ฌ Advanced Usage + +### Different Target Repositories + +Edit the config to point to your own repository: + +```yaml +steps: + select_input: + parameters: + repo_url: "https://github.com/your-org/your-repo.git" + ref: "main" + target_glob: "src/**/*.py" # Adjust path pattern +``` + +### Custom Prompts + +Create new Jinja2 templates in `prompts/`: + +```jinja2 +# prompts/custom_test_v3.jinja + +Generate {{ max_tests }} tests for: +{{ file_path }} (complexity: {{ complexity_score }}) + +Source: +```python +{{ source_code }} +``` + +Requirements: +- Use pytest fixtures +- Include edge cases +- Mock external dependencies +``` + +### A/B Testing Experiments + +Use run templates for systematic comparisons: + +```bash +# Compare prompt versions +python scripts/run_experiment.py --config configs/experiment.default.yaml +python scripts/run_experiment.py --config configs/experiment.strict.yaml + +# Compare in ZenML dashboard: +# - Coverage metrics +# - Test quality scores +# - Token usage and cost +# - Promotion decisions +``` + +### Production Deployment + +Set up ZenML stack for cloud deployment: + +```bash +# Example: AWS EKS stack +zenml artifact-store register s3_store --flavor=s3 --path=s3://your-bucket +zenml container-registry register ecr_registry --flavor=aws --uri=your-account.dkr.ecr.region.amazonaws.com +zenml orchestrator register k8s_orchestrator --flavor=kubernetes --kubernetes_context=your-eks-context + +zenml stack register production_stack \ + -a s3_store -c ecr_registry -o k8s_orchestrator --set +``` + +### Scheduled Regression + +Register batch regression for daily execution: + +```bash +python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule +``` + +## ๐Ÿ—๏ธ Project Structure + +``` +qualityflow/ +โ”œโ”€โ”€ README.md +โ”œโ”€โ”€ pyproject.toml +โ”œโ”€โ”€ requirements.txt +โ”œโ”€โ”€ .env.example +โ”œโ”€โ”€ zenml.yaml +โ”‚ +โ”œโ”€โ”€ configs/ # Pipeline configurations +โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings +โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ””โ”€โ”€ schedule.batch.yaml # Batch regression schedule +โ”‚ +โ”œโ”€โ”€ domain/ # Core data models +โ”‚ โ”œโ”€โ”€ schema.py # Pydantic models +โ”‚ โ””โ”€โ”€ stages.py # Deployment stages +โ”‚ +โ”œโ”€โ”€ pipelines/ # Pipeline definitions +โ”‚ โ”œโ”€โ”€ generate_and_evaluate.py # Experiment pipeline +โ”‚ โ””โ”€โ”€ batch_regression.py # Scheduled regression +โ”‚ +โ”œโ”€โ”€ steps/ # 
Pipeline steps +โ”‚ โ”œโ”€โ”€ select_input.py # Source specification +โ”‚ โ”œโ”€โ”€ fetch_source.py # Repository fetching +โ”‚ โ”œโ”€โ”€ analyze_code.py # Code analysis & selection +โ”‚ โ”œโ”€โ”€ gen_tests_agent.py # LLM test generation +โ”‚ โ”œโ”€โ”€ gen_tests_baseline.py # Heuristic test generation +โ”‚ โ”œโ”€โ”€ run_tests.py # Test execution & coverage +โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics & gate evaluation +โ”‚ โ”œโ”€โ”€ compare_and_promote.py # Model registry promotion +โ”‚ โ”œโ”€โ”€ resolve_test_pack.py # Test pack resolution +โ”‚ โ””โ”€โ”€ report.py # Report generation +โ”‚ +โ”œโ”€โ”€ prompts/ # Jinja2 prompt templates +โ”‚ โ”œโ”€โ”€ unit_test_v1.jinja # Standard test generation +โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation +โ”‚ +โ”œโ”€โ”€ materializers/ # Custom artifact handling +โ”œโ”€โ”€ utils/ # Utility functions +โ”‚ +โ”œโ”€โ”€ registry/ # Test Pack registry docs +โ”‚ โ””โ”€โ”€ README.md +โ”‚ +โ”œโ”€โ”€ run_templates/ # Experiment templates +โ”‚ โ”œโ”€โ”€ ab_agent_vs_strict.json # A/B testing configuration +โ”‚ โ””โ”€โ”€ baseline_only.json # Baseline establishment +โ”‚ +โ”œโ”€โ”€ scripts/ # CLI scripts +โ”‚ โ”œโ”€โ”€ run_experiment.py # Experiment runner +โ”‚ โ””โ”€โ”€ run_batch.py # Batch regression runner +โ”‚ +โ””โ”€โ”€ examples/ # Demo code for testing + โ””โ”€โ”€ toy_lib/ # Sample library + โ”œโ”€โ”€ calculator.py + โ””โ”€โ”€ string_utils.py +``` + +### Key Components + +- **Domain Models**: Pydantic schemas for type safety and validation +- **Pipeline Steps**: Modular, reusable components with clear interfaces +- **Prompt Templates**: Jinja2 templates for LLM test generation +- **Configuration**: YAML-driven experiment and deployment settings +- **Quality Gates**: Configurable thresholds for coverage and promotion +- **Model Registry**: ZenML Model Registry integration for test pack versioning + +## ๐Ÿš€ Production Deployment + +### ZenML Cloud Stack Setup + +For production deployment with ZenML Cloud: + +```bash +# Connect to ZenML Cloud +zenml connect --url https://your-org.zenml.cloud + +# Register cloud stack components +zenml artifact-store register cloud_store --flavor=s3 --path=s3://qualityflow-artifacts +zenml orchestrator register cloud_k8s --flavor=kubernetes --kubernetes_context=prod-cluster + +zenml stack register production \ + -a cloud_store -o cloud_k8s --set +``` + +### Scheduled Execution + +Set up automated regression testing: + +```bash +# Register schedule (example with ZenML Cloud) +python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule + +# Monitor via dashboard: +# - Daily regression results +# - Coverage trend analysis +# - Test pack performance +``` + +## ๐Ÿค Contributing + +QualityFlow follows ZenML best practices and is designed to be extended: + +1. **Add New LLM Providers**: Extend `gen_tests_agent.py` with new provider integrations +2. **Custom Materializers**: Create materializers for new artifact types +3. **Additional Metrics**: Expand evaluation capabilities with new quality metrics +4. **Selection Strategies**: Add new code selection algorithms + +## ๐Ÿ“ Next Steps + +After running QualityFlow successfully: + +1. **Explore ZenML Dashboard**: View pipeline runs, artifacts, and model registry +2. **Experiment with Prompts**: Try different test generation strategies +3. **Add Real Codebases**: Replace toy examples with your production code +4. **Deploy to Production**: Use cloud orchestration for scale +5. 
**Set Up Monitoring**: Configure alerts for regression detection + +## ๐Ÿ†˜ Troubleshooting + +### Common Issues + +**LLM API Errors**: +- Set `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` environment variables +- Use `provider: "fake"` for development without API keys + +**Test Execution Failures**: +- Ensure pytest and coverage tools are installed +- Check that workspace has proper Python path setup + +### Debug Mode + +Run with debug logging: + +```bash +export ZENML_LOGGING_VERBOSITY=DEBUG +python scripts/run_experiment.py --config configs/experiment.default.yaml +``` + +## ๐Ÿ“š Resources + +- [ZenML Documentation](https://docs.zenml.io/) +- [Model Control Plane](https://docs.zenml.io/user-guide/model-control-plane) +- [Kubernetes Orchestrator](https://docs.zenml.io/stacks/stack-components/orchestrators/kubernetes) + +--- + +Built with โค๏ธ using [ZenML](https://zenml.io) - *The MLOps Framework for Production AI* \ No newline at end of file diff --git a/qualityflow/configs/experiment.default.yaml b/qualityflow/configs/experiment.default.yaml new file mode 100644 index 00000000..61537368 --- /dev/null +++ b/qualityflow/configs/experiment.default.yaml @@ -0,0 +1,41 @@ +# QualityFlow Default Experiment Configuration +# Production-ready template for automated test generation & validation + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" # low_coverage | changed_files | all + max_files: 3 # Reduced for faster testing + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 2 # Limit files for faster testing + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 2 # Match agent max_files for consistency + + # No more evaluation gates or promotion - just simple coverage comparison + +# Resource configuration +settings: + docker: + requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/configs/experiment.strict.yaml b/qualityflow/configs/experiment.strict.yaml new file mode 100644 index 00000000..8d1d15ba --- /dev/null +++ b/qualityflow/configs/experiment.strict.yaml @@ -0,0 +1,42 @@ +# QualityFlow Strict Experiment Configuration +# Higher quality gates and strict prompt for comprehensive testing + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py,tests/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" + max_files: 5 # Fewer files for more thorough testing + + # LLM generation with strict prompt + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o" # More powerful model + prompt_path: "prompts/unit_test_strict_v2.jinja" + max_tests_per_file: 5 # More tests per file + max_files: 5 # Match analyze_code for consistency + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent for fair comparison + +# Resource configuration with higher limits +settings: + docker: + requirements: requirements.txt + resource_settings: + memory: "4Gi" + cpu_count: 
2.0 \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/__init__.py b/qualityflow/examples/toy_lib/__init__.py new file mode 100644 index 00000000..c70599d5 --- /dev/null +++ b/qualityflow/examples/toy_lib/__init__.py @@ -0,0 +1,5 @@ +""" +QualityFlow toy library example for testing. +""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/calculator.py b/qualityflow/examples/toy_lib/calculator.py new file mode 100644 index 00000000..c9ec644d --- /dev/null +++ b/qualityflow/examples/toy_lib/calculator.py @@ -0,0 +1,75 @@ +"""Simple calculator module for QualityFlow demonstration.""" + +from typing import Union + + +class Calculator: + """A simple calculator with basic arithmetic operations.""" + + def __init__(self): + """Initialize calculator with empty history.""" + self.history = [] + + def add(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Add two numbers.""" + result = a + b + self.history.append(f"{a} + {b} = {result}") + return result + + def subtract(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Subtract second number from first.""" + result = a - b + self.history.append(f"{a} - {b} = {result}") + return result + + def multiply(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Multiply two numbers.""" + result = a * b + self.history.append(f"{a} * {b} = {result}") + return result + + def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Divide first number by second.""" + if b == 0: + raise ValueError("Cannot divide by zero") + result = a / b + self.history.append(f"{a} / {b} = {result}") + return result + + def power(self, base: Union[int, float], exponent: Union[int, float]) -> Union[int, float]: + """Raise base to the power of exponent.""" + result = base ** exponent + self.history.append(f"{base} ** {exponent} = {result}") + return result + + def clear_history(self) -> None: + """Clear calculation history.""" + self.history.clear() + + def get_history(self) -> list[str]: + """Get calculation history.""" + return self.history.copy() + + +def factorial(n: int) -> int: + """Calculate factorial of n.""" + if n < 0: + raise ValueError("Factorial is not defined for negative numbers") + if n == 0 or n == 1: + return 1 + return n * factorial(n - 1) + + +def is_prime(n: int) -> bool: + """Check if a number is prime.""" + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + + for i in range(3, int(n**0.5) + 1, 2): + if n % i == 0: + return False + return True \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/string_utils.py b/qualityflow/examples/toy_lib/string_utils.py new file mode 100644 index 00000000..d842b500 --- /dev/null +++ b/qualityflow/examples/toy_lib/string_utils.py @@ -0,0 +1,120 @@ +""" +String utility functions for QualityFlow demonstration. 
+""" + +import re +from typing import List, Optional + + +def reverse_string(s: str) -> str: + """Reverse a string.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + return s[::-1] + + +def is_palindrome(s: str, ignore_case: bool = True) -> bool: + """Check if a string is a palindrome.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + + # Clean the string - keep only alphanumeric characters + cleaned = re.sub(r'[^a-zA-Z0-9]', '', s) + + if ignore_case: + cleaned = cleaned.lower() + + return cleaned == cleaned[::-1] + + +def count_words(text: str) -> int: + """Count words in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + if not text.strip(): + return 0 + + words = text.split() + return len(words) + + +def capitalize_words(text: str) -> str: + """Capitalize the first letter of each word.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + return ' '.join(word.capitalize() for word in text.split()) + + +def extract_emails(text: str) -> List[str]: + """Extract email addresses from text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + return re.findall(email_pattern, text) + + +def truncate_string(s: str, max_length: int, suffix: str = "...") -> str: + """Truncate string to maximum length with suffix.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + if not isinstance(max_length, int) or max_length < 0: + raise ValueError("max_length must be a non-negative integer") + + if len(s) <= max_length: + return s + + if max_length <= len(suffix): + return s[:max_length] + + return s[:max_length - len(suffix)] + suffix + + +class TextProcessor: + """Text processing utility class.""" + + def __init__(self, default_encoding: str = "utf-8"): + self.default_encoding = default_encoding + self.processed_count = 0 + + def clean_text(self, text: str, remove_punctuation: bool = False) -> str: + """Clean text by removing extra whitespace and optionally punctuation.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + # Remove extra whitespace + cleaned = ' '.join(text.split()) + + if remove_punctuation: + # Remove punctuation except spaces + cleaned = re.sub(r'[^\w\s]', '', cleaned) + + self.processed_count += 1 + return cleaned + + def word_frequency(self, text: str, ignore_case: bool = True) -> dict[str, int]: + """Count word frequency in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + words = text.split() + if ignore_case: + words = [word.lower() for word in words] + + frequency = {} + for word in words: + # Remove punctuation from word + clean_word = re.sub(r'[^\w]', '', word) + if clean_word: + frequency[clean_word] = frequency.get(clean_word, 0) + 1 + + return frequency + + def get_stats(self) -> dict[str, int]: + """Get processing statistics.""" + return { + "processed_count": self.processed_count, + "default_encoding": self.default_encoding + } \ No newline at end of file diff --git a/qualityflow/pipelines/__init__.py b/qualityflow/pipelines/__init__.py new file mode 100644 index 00000000..525d0f58 --- /dev/null +++ b/qualityflow/pipelines/__init__.py @@ -0,0 +1,5 @@ +"""QualityFlow pipelines.""" + +from .generate_and_evaluate import generate_and_evaluate + +__all__ = ["generate_and_evaluate"] \ No newline at end of file diff --git 
a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py new file mode 100644 index 00000000..c50754e3 --- /dev/null +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -0,0 +1,63 @@ +""" +QualityFlow experiment pipeline for test generation and evaluation. +""" + +from typing import Annotated + +from zenml import pipeline +from zenml.logger import get_logger + +from steps.select_input import select_input +from steps.fetch_source import fetch_source +from steps.analyze_code import analyze_code +from steps.gen_tests_agent import gen_tests_agent +from steps.gen_tests_baseline import gen_tests_baseline +from steps.run_tests import run_tests +from steps.report import report + +logger = get_logger(__name__) + + +@pipeline(name="generate_and_evaluate") +def generate_and_evaluate() -> None: + """QualityFlow pipeline for generating and evaluating tests. + + Simple, focused pipeline: + 1. Analyze code to find files needing tests + 2. Generate tests using LLM and baseline approaches + 3. Run tests and measure coverage + 4. Report results for comparison + """ + # Step 1: Resolve source specification + spec = select_input() + + # Step 2: Fetch and materialize workspace + workspace_dir, commit_sha = fetch_source(spec) + + # Step 3: Analyze and select code files + code_summary = analyze_code( + workspace_dir, commit_sha + ) + + # Step 4: Generate tests using LLM agent + agent_tests_dir, prompt_used = gen_tests_agent( + workspace_dir, code_summary + ) + + # Step 5: Generate baseline tests (optional) + baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) + + # Step 6: Run agent tests + agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent") + + # Step 7: Run baseline tests (if available) + baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline") + + # Step 8: Generate comprehensive report (includes evaluation) + report( + workspace_dir, + commit_sha, + prompt_used, + agent_results, + baseline_results, + ) \ No newline at end of file diff --git a/qualityflow/prompts/unit_test_strict_v2.jinja b/qualityflow/prompts/unit_test_strict_v2.jinja new file mode 100644 index 00000000..32dd2643 --- /dev/null +++ b/qualityflow/prompts/unit_test_strict_v2.jinja @@ -0,0 +1,99 @@ +# Unit Test Generation Prompt v2.0 (Strict) +# Comprehensive test generation with advanced patterns + +You are a senior Python test engineer with expertise in test-driven development. Generate production-grade unit tests with comprehensive coverage. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Advanced Testing Requirements + +Generate {{ max_tests }} comprehensive tests covering ALL of the following: + +### 1. Functional Coverage +- **Happy paths**: Normal operation scenarios +- **Edge cases**: Boundary values, empty collections, extreme inputs +- **Error handling**: Exception paths, invalid states +- **State transitions**: Object lifecycle, state changes + +### 2. Quality Patterns +- **Arrange-Act-Assert** structure +- **Given-When-Then** scenarios +- **Property-based testing** where applicable +- **Parameterized tests** for multiple scenarios + +### 3. 
Advanced Techniques +- **Mock interactions**: Verify call patterns, not just return values +- **Context managers**: Test resource cleanup +- **Async/await**: If code contains async patterns +- **Thread safety**: If code has concurrency +- **Performance bounds**: Basic timing assertions + +### 4. Security Considerations +- **Input sanitization**: SQL injection, XSS prevention +- **Authorization**: Access control validation +- **Data exposure**: Sensitive information leakage + +## Technical Requirements + +- Use `pytest` with fixtures and parametrization +- Implement proper test isolation +- Include integration test patterns where relevant +- Use `hypothesis` for property-based tests when beneficial +- Mock all external dependencies (filesystem, network, databases) +- Test both success and failure scenarios thoroughly + +## Output Format + +Provide production-ready test code: + +```python +""" +Comprehensive unit tests for {{ file_path }} +Generated by QualityFlow (Strict Mode) +Coverage target: >95% line and branch coverage +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock, call +from hypothesis import given, strategies as st +import tempfile +import os +from contextlib import contextmanager + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase): + """Comprehensive test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures and mock objects.""" + pass + + def tearDown(self): + """Clean up after tests.""" + pass + + # Generated test methods with comprehensive coverage + + @pytest.mark.parametrize("input,expected", [ + # Add parameterized test cases + ]) + def test_parametrized_scenarios(self, input, expected): + """Test multiple scenarios with parameterization.""" + pass + +if __name__ == "__main__": + unittest.main() +``` + +Focus on realistic, maintainable tests that would pass code review in a production environment. \ No newline at end of file diff --git a/qualityflow/prompts/unit_test_v1.jinja b/qualityflow/prompts/unit_test_v1.jinja new file mode 100644 index 00000000..1c1cd444 --- /dev/null +++ b/qualityflow/prompts/unit_test_v1.jinja @@ -0,0 +1,61 @@ +# Unit Test Generation Prompt v1.0 +# Standard test generation for QualityFlow + +You are an expert Python test engineer. Generate comprehensive unit tests for the following code. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Instructions + +Generate {{ max_tests }} high-quality unit tests that cover: +1. **Happy path scenarios** - typical usage patterns +2. **Edge cases** - boundary conditions, empty inputs, None values +3. **Error conditions** - invalid inputs, exceptions +4. 
**Integration points** - mocked dependencies where applicable + +## Requirements + +- Use `pytest` and `unittest.TestCase` patterns +- Include proper docstrings for test methods +- Use `unittest.mock` for external dependencies +- Focus on behavioral testing, not implementation details +- Ensure tests are deterministic and repeatable +- Include setup/teardown if needed + +## Output Format + +Provide only the Python test code with no additional explanation: + +```python +""" +Unit tests for {{ file_path }} +Generated by QualityFlow +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class TestModule(unittest.TestCase): + """Test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures.""" + pass + + # Your generated test methods here + +if __name__ == "__main__": + unittest.main() +``` \ No newline at end of file diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt new file mode 100644 index 00000000..72c59212 --- /dev/null +++ b/qualityflow/requirements.txt @@ -0,0 +1,22 @@ +# ZenML and Core MLOps +zenml>=0.84.2 + +# Core Python Libraries +pydantic>=2.0.0,<3.0.0 +pyyaml>=6.0,<7.0 +jinja2>=3.0.0,<4.0.0 + +# Testing Framework +pytest>=7.0.0,<8.0.0 +pytest-cov>=4.0.0,<5.0.0 +coverage>=7.0.0,<8.0.0 + +# Code Analysis +ast>=3.9 + +# Git Integration +gitpython>=3.1.0,<4.0.0 + +# LLM Integration (optional) +openai>=1.0.0,<2.0.0 # for OpenAI provider +anthropic>=0.25.0,<1.0.0 # for Anthropic provider \ No newline at end of file diff --git a/qualityflow/run.py b/qualityflow/run.py new file mode 100644 index 00000000..11751350 --- /dev/null +++ b/qualityflow/run.py @@ -0,0 +1,55 @@ +""" +Entry point for running QualityFlow test generation pipeline. +""" + +from pathlib import Path + +import click +from pipelines import generate_and_evaluate +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@click.command() +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False), + default=None, + required=False, + help="Path to configuration YAML file. Defaults to configs/experiment.default.yaml", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable pipeline caching and force fresh execution", +) +def main(config: str | None, no_cache: bool): + """Run QualityFlow test generation and coverage analysis pipeline. + + Simple pipeline that generates tests using LLM, runs them, measures coverage, + and compares results against baseline approaches. 
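+
+    Typical invocations (as shown in the README Quick Start):
+
+        python run.py
+        python run.py --no-cache
+        python run.py --config configs/experiment.strict.yaml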
+ """ + + project_root = Path(__file__).parent + default_config = project_root / "configs" / "experiment.default.yaml" + chosen_config = config or str(default_config) + + try: + logger.info(f"Starting QualityFlow pipeline with config: {chosen_config}") + pipeline_instance = generate_and_evaluate.with_options( + config_path=chosen_config, + enable_cache=not no_cache + ) + pipeline_instance() + logger.info("QualityFlow pipeline completed successfully!") + + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + raise + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/qualityflow/steps/__init__.py b/qualityflow/steps/__init__.py new file mode 100644 index 00000000..70abca08 --- /dev/null +++ b/qualityflow/steps/__init__.py @@ -0,0 +1,21 @@ +"""QualityFlow pipeline steps.""" + +from .select_input import select_input +from .fetch_source import fetch_source +from .analyze_code import analyze_code +from .gen_tests_agent import gen_tests_agent +from .gen_tests_baseline import gen_tests_baseline +from .run_tests import run_tests +from .evaluate_coverage import evaluate_coverage +from .report import report + +__all__ = [ + "select_input", + "fetch_source", + "analyze_code", + "gen_tests_agent", + "gen_tests_baseline", + "run_tests", + "evaluate_coverage", + "report", +] \ No newline at end of file diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py new file mode 100644 index 00000000..0bfebd9c --- /dev/null +++ b/qualityflow/steps/analyze_code.py @@ -0,0 +1,151 @@ +""" +Analyze and select code files for test generation. +""" + +import glob +import ast +import os +from pathlib import Path +from typing import Annotated, Dict, List, Tuple + +from zenml import step +from zenml.logger import get_logger +from enum import Enum + + +class SelectionStrategy(str, Enum): + """Code file selection strategies.""" + LOW_COVERAGE = "low_coverage" + CHANGED_FILES = "changed_files" + ALL = "all" + +logger = get_logger(__name__) + + +@step +def analyze_code( + workspace_dir: Path, + commit_sha: str, + target_glob: str = "src/**/*.py", + strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, + max_files: int = 10, +) -> Annotated[Dict, "code_summary"]: + """ + Analyze workspace and select candidate files for test generation. 
+ + Args: + workspace_dir: Path to workspace directory + commit_sha: Git commit SHA + target_glob: Glob pattern for target files + strategy: File selection strategy + max_files: Maximum number of files to select + + Returns: + Code summary dictionary containing selected files and metadata + """ + logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") + + workspace_path = Path(workspace_dir) + + # Find all Python files matching glob pattern + all_files = [] + for pattern in target_glob.split(","): + pattern = pattern.strip() + matched_files = glob.glob(str(workspace_path / pattern), recursive=True) + all_files.extend(matched_files) + + # Make paths relative to workspace + relative_files = [ + os.path.relpath(f, workspace_dir) + for f in all_files + if f.endswith('.py') and os.path.isfile(f) + ] + + logger.info(f"Found {len(relative_files)} Python files") + + # Calculate complexity scores + complexity_scores = {} + valid_files = [] + + for file_path in relative_files: + full_path = workspace_path / file_path + try: + with open(full_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Parse AST and calculate basic complexity + tree = ast.parse(content) + complexity = _calculate_complexity(tree) + complexity_scores[file_path] = complexity + valid_files.append(file_path) + + except (SyntaxError, UnicodeDecodeError) as e: + logger.warning(f"Skipping {file_path} due to parsing error: {e}") + continue + + # Select files based on strategy + selected_files = _select_files(valid_files, complexity_scores, strategy, max_files) + + code_summary = { + "selected_files": selected_files, + "total_files": len(valid_files), + "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", + "complexity_scores": {f: complexity_scores[f] for f in selected_files} + } + + logger.info(f"Selected {len(selected_files)} files: {selected_files}") + + return code_summary + + +def _calculate_complexity(tree: ast.AST) -> float: + """Calculate basic complexity score for an AST.""" + class ComplexityVisitor(ast.NodeVisitor): + def __init__(self): + self.complexity = 0 + self.functions = 0 + self.classes = 0 + + def visit_FunctionDef(self, node): + self.functions += 1 + self.complexity += 1 + for child in ast.walk(node): + if isinstance(child, (ast.If, ast.For, ast.While, ast.Try)): + self.complexity += 1 + self.generic_visit(node) + + def visit_ClassDef(self, node): + self.classes += 1 + self.complexity += 1 + self.generic_visit(node) + + visitor = ComplexityVisitor() + visitor.visit(tree) + + # Combine metrics into single score + return visitor.complexity + visitor.functions * 0.5 + visitor.classes * 2 + + +def _select_files( + files: List[str], + complexity_scores: Dict[str, float], + strategy: SelectionStrategy, + max_files: int +) -> List[str]: + """Select files based on strategy.""" + + if strategy == SelectionStrategy.ALL: + return files[:max_files] + + elif strategy == SelectionStrategy.LOW_COVERAGE: + # Prioritize complex files that likely need more tests + sorted_files = sorted(files, key=lambda f: complexity_scores[f], reverse=True) + return sorted_files[:max_files] + + elif strategy == SelectionStrategy.CHANGED_FILES: + # For this demo, just return all files (in real implementation, would use git diff) + logger.warning("CHANGED_FILES strategy not fully implemented, falling back to ALL") + return files[:max_files] + + else: + raise ValueError(f"Unknown selection strategy: {strategy}") \ No newline at end of file diff --git 
a/qualityflow/steps/evaluate_coverage.py b/qualityflow/steps/evaluate_coverage.py new file mode 100644 index 00000000..9d384b10 --- /dev/null +++ b/qualityflow/steps/evaluate_coverage.py @@ -0,0 +1,68 @@ +""" +Evaluate coverage metrics and compare against baselines. +""" + +from typing import Annotated, Dict, Optional +from zenml import step, Model +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def evaluate_coverage( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Annotated[Dict, "evaluation_metrics"]: + """ + Evaluate coverage metrics and compare agent vs baseline approaches. + + Args: + agent_results: Test results from agent-generated tests + baseline_results: Test results from baseline tests (optional) + commit_sha: Current commit SHA + + Returns: + Evaluation metrics dictionary with coverage comparison + """ + logger.info("Evaluating coverage metrics and computing deltas") + + # Extract agent metrics + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 + + # Extract baseline metrics + coverage_total_baseline = None + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = 0.0 + if coverage_total_baseline is not None: + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" + coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" + + evaluation_metrics = { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + logger.info(f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, improvement={coverage_improvement:+.2f}%") + + return evaluation_metrics \ No newline at end of file diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py new file mode 100644 index 00000000..c117f2d2 --- /dev/null +++ b/qualityflow/steps/fetch_source.py @@ -0,0 +1,70 @@ +""" +Fetch source code workspace step. +""" + +import tempfile +import subprocess +from pathlib import Path +from typing import Annotated, Dict, Tuple + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def fetch_source( + source_spec: Dict[str, str], +) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: + """ + Fetch and materialize workspace from git repository. 
+ + Args: + source_spec: Source specification from select_input step + + Returns: + Tuple of workspace directory path and commit SHA + """ + repo_url = source_spec["repo_url"] + ref = source_spec["ref"] + + logger.info(f"Fetching source from {repo_url}@{ref}") + + # Create temporary workspace + workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") + workspace_path = Path(workspace_dir) + + try: + # Clone repository + logger.info(f"Cloning {repo_url} to {workspace_dir}") + subprocess.run( + ["git", "clone", "--depth", "1", "--branch", ref, repo_url, workspace_dir], + check=True, + capture_output=True, + text=True, + ) + + # Get commit SHA + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=workspace_dir, + check=True, + capture_output=True, + text=True, + ) + commit_sha = result.stdout.strip() + + logger.info(f"Workspace ready at {workspace_dir}, commit: {commit_sha}") + + return Path(workspace_dir), commit_sha + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to fetch source: {e}") + raise RuntimeError(f"Git operation failed: {e.stderr}") + except Exception as e: + logger.error(f"Unexpected error fetching source: {e}") + # Clean up on error + import shutil + shutil.rmtree(workspace_dir, ignore_errors=True) + raise \ No newline at end of file diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py new file mode 100644 index 00000000..8ed37d31 --- /dev/null +++ b/qualityflow/steps/gen_tests_agent.py @@ -0,0 +1,358 @@ +""" +Generate tests using LLM agent. +""" + +import tempfile +from pathlib import Path +from typing import Annotated, Dict, List, Tuple +from jinja2 import Template + +from zenml import step +from zenml.logger import get_logger +from zenml import log_metadata +from enum import Enum + + +class GenerationProvider(str, Enum): + """LLM providers for test generation.""" + OPENAI = "openai" + ANTHROPIC = "anthropic" + FAKE = "fake" + +logger = get_logger(__name__) + + +@step +def gen_tests_agent( + workspace_dir: Path, + code_summary: Dict, + provider: GenerationProvider = GenerationProvider.FAKE, + model: str = "gpt-4o-mini", + prompt_path: str = "prompts/unit_test_v1.jinja", + max_tests_per_file: int = 3, + max_files: int = 10, +) -> Tuple[ + Annotated[Path, "agent_tests_dir"], + Annotated[str, "prompt_used"] +]: + """Generate tests using LLM agent. 
+ + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + provider: LLM provider to use + model: Model name + prompt_path: Path to Jinja2 prompt template + max_tests_per_file: Maximum tests to generate per file + max_files: Maximum number of files to process (for speed control) + + Returns: + Tuple of test directory and prompt used + """ + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = selected_files[:max_files] if max_files > 0 else selected_files + logger.info(f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}") + + # Create tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") + tests_path = Path(tests_dir) + + # Load prompt template + workspace_path = Path(workspace_dir) + prompt_file = workspace_path / prompt_path + + if prompt_file.exists(): + with open(prompt_file, 'r') as f: + prompt_template = f.read() + else: + # Use default template if file doesn't exist + prompt_template = _get_default_prompt_template() + logger.info(f"Using default prompt template, {prompt_path} not found") + + template = Template(prompt_template) + + total_tokens_in = 0 + total_tokens_out = 0 + materialized_prompts = {} # Store materialized prompts per file + + for file_path in files_to_process: + logger.info(f"Generating tests for {file_path}") + + # Read source file + full_file_path = workspace_path / file_path + with open(full_file_path, 'r') as f: + source_code = f.read() + + # Render prompt + materialized_prompt = template.render( + file_path=file_path, + source_code=source_code, + max_tests=max_tests_per_file, + complexity_score=code_summary.get("complexity_scores", {}).get(file_path, 0) + ) + + # Store the materialized prompt for this file + materialized_prompts[file_path] = materialized_prompt + + # Generate tests using provider + if provider == GenerationProvider.FAKE: + generated_tests, tokens = _generate_fake_tests(file_path, source_code, max_tests_per_file) + elif provider == GenerationProvider.OPENAI: + generated_tests, tokens = _generate_openai_tests(materialized_prompt, model) + elif provider == GenerationProvider.ANTHROPIC: + generated_tests, tokens = _generate_anthropic_tests(materialized_prompt, model) + else: + raise ValueError(f"Unsupported provider: {provider}") + + total_tokens_in += tokens.get("tokens_in", 0) + total_tokens_out += tokens.get("tokens_out", 0) + + # Save generated tests + test_file_name = f"test_{Path(file_path).stem}.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, 'w') as f: + f.write(generated_tests) + + logger.info(f"Generated tests saved to {test_file_path}") + + # Log comprehensive metadata including materialized prompts + metadata = { + "token_usage": { + "tokens_in": total_tokens_in, + "tokens_out": total_tokens_out, + "cost_estimate": _estimate_cost(total_tokens_in, total_tokens_out, provider, model), + }, + "config": { + "provider": provider.value, + "model": model, + "prompt_template_path": prompt_path, + "max_tests_per_file": max_tests_per_file, + "files_processed": len(files_to_process), + }, + "materialized_prompts": materialized_prompts, + "prompt_template": prompt_template, + } + + log_metadata(metadata) + logger.info(f"Test generation complete. 
Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out") + + # Create a better prompt summary for the report + prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir), prompt_summary + + +def _get_default_prompt_template() -> str: + """Default Jinja2 prompt template for test generation.""" + return """# Generate unit tests for the following Python code + +File: {{ file_path }} +Complexity Score: {{ complexity_score }} +Max Tests: {{ max_tests }} + +## Source Code: +```python +{{ source_code }} +``` + +## Instructions: +Generate {{ max_tests }} comprehensive unit tests for the functions and classes in this code. +Focus on edge cases, error conditions, and typical usage patterns. + +## Generated Tests: +""" + + +def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tuple[str, Dict]: + """Generate fake/mock tests for development/testing.""" + # Create a simple module name from file path + module_name = file_path.replace('/', '.').replace('.py', '') + + test_content = f'''""" +Generated tests for {file_path} +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +class Test{file_path.split('/')[-1].replace('.py', '').title()}(unittest.TestCase): + """Auto-generated test class for {file_path}.""" + + def test_module_import(self): + """Test that we can at least validate the test framework.""" + # Simple test that always passes to ensure test discovery works + self.assertTrue(True) + + def test_basic_functionality(self): + """Test basic functionality.""" + # Mock test demonstrating test execution + result = 1 + 1 + self.assertEqual(result, 2) + + def test_error_handling(self): + """Test error handling.""" + # Test exception handling + with self.assertRaises(ValueError): + raise ValueError("Expected test exception") + + def test_mock_usage(self): + """Test mock functionality.""" + # Test using mocks + mock_obj = Mock() + mock_obj.method.return_value = "mocked_result" + result = mock_obj.method() + self.assertEqual(result, "mocked_result") + + def test_coverage_target(self): + """Test that generates some coverage.""" + # Simple operations to generate coverage + data = {{"key": "value"}} + self.assertIn("key", data) + + items = [1, 2, 3, 4, 5] + filtered = [x for x in items if x > 3] + self.assertEqual(len(filtered), 2) + +if __name__ == "__main__": + unittest.main() +''' + + tokens = {"tokens_in": 100, "tokens_out": 50} + return test_content, tokens + + +def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using OpenAI API.""" + try: + import openai + import os + + # Get API key from environment + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + logger.warning("OPENAI_API_KEY not found, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + client = openai.OpenAI(api_key=api_key) + + # Call OpenAI API + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a Python test generation expert. 
Generate comprehensive unit tests for the given code."}, + {"role": "user", "content": prompt} + ], + max_tokens=2000, + temperature=0.1 + ) + + # Extract test code from response + generated_content = response.choices[0].message.content + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.prompt_tokens, + "tokens_out": response.usage.completion_tokens + } + + logger.info(f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + return test_content, tokens + + except ImportError: + logger.warning("OpenAI library not installed, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with OpenAI: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + +def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using Anthropic API.""" + try: + import anthropic + import os + + # Get API key from environment + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + logger.warning("ANTHROPIC_API_KEY not found, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + client = anthropic.Anthropic(api_key=api_key) + + # Call Anthropic API + response = client.messages.create( + model=model, + max_tokens=2000, + temperature=0.1, + messages=[ + {"role": "user", "content": f"You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.\n\n{prompt}"} + ] + ) + + # Extract test content from response + generated_content = response.content[0].text + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.input_tokens, + "tokens_out": response.usage.output_tokens + } + + logger.info(f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + return test_content, tokens + + except ImportError: + logger.warning("Anthropic library not installed, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with Anthropic: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + +def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str) -> float: + """Estimate cost based on token usage.""" + # Rough cost estimates (would need real pricing) + if provider == GenerationProvider.OPENAI: + if "gpt-4" in model: + return (tokens_in * 0.00003) + (tokens_out * 0.00006) + else: # gpt-3.5 + return (tokens_in * 0.0000015) + (tokens_out * 0.000002) + elif provider == GenerationProvider.ANTHROPIC: + return (tokens_in * 0.000008) + (tokens_out * 0.000024) + else: + return 0.0 \ No newline at end of file diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py new file mode 100644 index 00000000..68a0a4e4 --- /dev/null +++ b/qualityflow/steps/gen_tests_baseline.py @@ -0,0 +1,190 @@ +""" +Generate baseline/skeleton tests using heuristics. +""" + +import tempfile +import ast +from pathlib import Path +from typing import Annotated, Dict, List, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def gen_tests_baseline( + workspace_dir: Path, + code_summary: Dict, + enabled: bool = True, + max_files: int = 10, +) -> Annotated[Optional[Path], "baseline_tests_dir"]: + """ + Generate baseline/skeleton tests using heuristic analysis. 
+ + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + enabled: Whether baseline generation is enabled + max_files: Maximum number of files to process + + Returns: + Path to baseline tests directory, or None if disabled + """ + if not enabled: + logger.info("Baseline test generation disabled") + return None + + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = selected_files[:max_files] if max_files > 0 else selected_files + logger.info(f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files") + + # Create baseline tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_baseline_tests_") + tests_path = Path(tests_dir) + + workspace_path = Path(workspace_dir) + + for file_path in files_to_process: + logger.info(f"Generating baseline tests for {file_path}") + + # Read and parse source file + full_file_path = workspace_path / file_path + with open(full_file_path, 'r') as f: + source_code = f.read() + + try: + tree = ast.parse(source_code) + + # Extract functions and classes + functions, classes = _extract_testable_items(tree) + + # Generate skeleton tests + test_content = _generate_skeleton_tests(file_path, functions, classes) + + # Save baseline tests + test_file_name = f"test_{Path(file_path).stem}_baseline.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, 'w') as f: + f.write(test_content) + + logger.info(f"Baseline tests saved to {test_file_path}") + + except SyntaxError as e: + logger.warning(f"Skipping {file_path} due to syntax error: {e}") + continue + + logger.info("Baseline test generation complete") + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir) + + +def _extract_testable_items(tree: ast.AST) -> tuple[List[str], List[str]]: + """Extract function and class names from AST.""" + functions = [] + classes = [] + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + # Skip private functions (starting with _) + if not node.name.startswith('_'): + functions.append(node.name) + elif isinstance(node, ast.ClassDef): + # Skip private classes + if not node.name.startswith('_'): + classes.append(node.name) + + return functions, classes + + +def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List[str]) -> str: + """Generate skeleton test content.""" + + # Create imports section + imports = f'''""" +Baseline/skeleton tests for {file_path} +Generated using heuristic analysis. +""" + +import pytest +import unittest +from unittest.mock import Mock, patch +''' + + # Try to determine import path from file path + module_path = file_path.replace('/', '.').replace('.py', '') + if module_path.startswith('src.'): + module_path = module_path[4:] # Remove 'src.' 
prefix + + if functions or classes: + imports += f"# from {module_path} import {', '.join(functions + classes)}\n\n" + else: + imports += f"# from {module_path} import *\n\n" + + # Generate function tests + function_tests = "" + for func_name in functions: + function_tests += f''' +def test_{func_name}_basic(): + """Basic test for {func_name}.""" + # TODO: Implement test for {func_name} + pass + +def test_{func_name}_error_cases(): + """Error case test for {func_name}.""" + # TODO: Test error conditions for {func_name} + pass +''' + + # Generate class tests + class_tests = "" + for class_name in classes: + class_tests += f''' +class Test{class_name}(unittest.TestCase): + """Test suite for {class_name}.""" + + def setUp(self): + """Set up test fixtures.""" + # TODO: Initialize test fixtures + pass + + def test_{class_name.lower()}_init(self): + """Test {class_name} initialization.""" + # TODO: Test class initialization + pass + + def test_{class_name.lower()}_methods(self): + """Test {class_name} methods.""" + # TODO: Test class methods + pass +''' + + # Add default test if no functions or classes found + if not functions and not classes: + default_test = ''' +class TestModule(unittest.TestCase): + """Default test suite for module.""" + + def test_module_imports(self): + """Test that module can be imported.""" + # TODO: Add import test + pass +''' + class_tests += default_test + + # Combine all parts + test_content = imports + function_tests + class_tests + + # Add main block + test_content += ''' +if __name__ == "__main__": + unittest.main() +''' + + return test_content \ No newline at end of file diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py new file mode 100644 index 00000000..141ecda2 --- /dev/null +++ b/qualityflow/steps/report.py @@ -0,0 +1,238 @@ +""" +Generate comprehensive pipeline report. +""" + +import tempfile +from pathlib import Path +from typing import Annotated, Dict, Optional +from datetime import datetime + +from zenml import step +from zenml.logger import get_logger +from zenml.types import MarkdownString + +logger = get_logger(__name__) + + +@step +def report( + workspace_dir: Path, + commit_sha: str, + prompt_used: str, + agent_results: Dict, + baseline_results: Optional[Dict], +) -> Annotated[MarkdownString, "final_report"]: + """ + Generate comprehensive markdown report for pipeline execution. 
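For reference, the skeleton generator in `gen_tests_baseline.py` above produces files along these lines; the module and member names here are hypothetical and the snippet is hand-written from the template rather than captured from a run:

```python
"""
Baseline/skeleton tests for src/calculator.py
Generated using heuristic analysis.
"""

import pytest
import unittest
from unittest.mock import Mock, patch

# from calculator import add, Calculator


def test_add_basic():
    """Basic test for add."""
    # TODO: Implement test for add
    pass


def test_add_error_cases():
    """Error case test for add."""
    # TODO: Test error conditions for add
    pass


class TestCalculator(unittest.TestCase):
    """Test suite for Calculator."""

    def setUp(self):
        """Set up test fixtures."""
        # TODO: Initialize test fixtures
        pass

    def test_calculator_init(self):
        """Test Calculator initialization."""
        # TODO: Test class initialization
        pass

    def test_calculator_methods(self):
        """Test Calculator methods."""
        # TODO: Test class methods
        pass


if __name__ == "__main__":
    unittest.main()
```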
+ + Args: + workspace_dir: Workspace directory path + commit_sha: Git commit SHA + prompt_used: Prompt template used + agent_results: Agent test results + baseline_results: Baseline test results (optional) + + Returns: + Path to generated markdown report + """ + logger.info("Generating pipeline execution report") + + # Create report file + report_file = Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" + + # Evaluate coverage metrics first + evaluation_metrics = _evaluate_coverage_metrics(agent_results, baseline_results, commit_sha) + + # Generate report content + report_content = _generate_report_content( + workspace_dir, + commit_sha, + prompt_used, + agent_results, + baseline_results, + evaluation_metrics, + ) + + # Write report file + with open(report_file, 'w') as f: + f.write(report_content) + + logger.info(f"Report generated: {report_file}") + + # Return as MarkdownString for dashboard visualization + return MarkdownString(report_content) + + +def _evaluate_coverage_metrics( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Dict: + """Evaluate coverage metrics and compare agent vs baseline approaches.""" + + # Extract agent metrics - use actual values from test results + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 + + # Extract baseline metrics + coverage_total_baseline = 0.0 + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" + coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" + + return { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + +def _generate_report_content( + workspace_dir: Path, + commit_sha: str, + prompt_used: str, + agent_results: Dict, + baseline_results: Optional[Dict], + evaluation_metrics: Dict, +) -> str: + """Generate markdown report content.""" + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # Header + report = f"""# QualityFlow Pipeline Report + +Generated: {timestamp} +Commit: `{commit_sha}` +Workspace: `{workspace_dir}` + +## Executive Summary + +""" + + # Executive summary + coverage_agent = evaluation_metrics.get("coverage_total_agent", 0.0) + coverage_baseline = evaluation_metrics.get("coverage_total_baseline", 0.0) + improvement = evaluation_metrics.get("coverage_improvement", 0.0) + quality = evaluation_metrics.get("coverage_quality", "unknown") + + quality_emoji = "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" + improvement_emoji = 
"๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" + + report += f"""{quality_emoji} **Coverage Quality**: {quality.upper()} +{improvement_emoji} **Agent vs Baseline**: {coverage_agent:.2f}% vs {coverage_baseline:.2f}% ({improvement:+.2f}%) +๐Ÿงช **Tests**: {agent_results.get('tests_passed', 0)} passed, {agent_results.get('tests_failed', 0)} failed +๐Ÿ“ **Files**: {evaluation_metrics.get('files_analyzed', 0)} analyzed + +""" + + # Agent results section + report += """## Agent Test Results + +""" + + if agent_results.get("skipped", False): + report += "Agent tests were skipped.\n\n" + else: + report += f"""- **Tests Passed**: {agent_results.get('tests_passed', 0)} +- **Tests Failed**: {agent_results.get('tests_failed', 0)} +- **Pass Rate**: {evaluation_metrics.get('pass_rate_agent', 0.0):.1%} +- **Coverage**: {agent_results.get('coverage_total', 0.0):.2f}% +- **JUnit Report**: `{agent_results.get('junit_path', 'N/A')}` +- **Coverage Report**: `{agent_results.get('coverage_path', 'N/A')}` +- **Logs**: `{agent_results.get('logs_path', 'N/A')}` + +""" + + # Baseline results section (if available) + if baseline_results and not baseline_results.get("skipped", False): + report += """## Baseline Test Results + +""" + report += f"""- **Tests Passed**: {baseline_results.get('tests_passed', 0)} +- **Tests Failed**: {baseline_results.get('tests_failed', 0)} +- **Coverage**: {baseline_results.get('coverage_total', 0.0):.2f}% +- **JUnit Report**: `{baseline_results.get('junit_path', 'N/A')}` +- **Coverage Report**: `{baseline_results.get('coverage_path', 'N/A')}` + +""" + + # Evaluation metrics section + report += """## Coverage Analysis + +""" + + pass_rate = evaluation_metrics.get("pass_rate_agent", 0.0) + pass_quality = evaluation_metrics.get("pass_rate_quality", "unknown") + + report += f"""- **Agent Coverage**: {coverage_agent:.2f}% ({quality}) +- **Baseline Coverage**: {coverage_baseline:.2f}% +- **Improvement**: {improvement:+.2f}% +- **Test Pass Rate**: {pass_rate:.1%} ({pass_quality}) +- **Files Analyzed**: {evaluation_metrics.get('files_analyzed', 0)} + +""" + + # Recommendations section + report += """## Recommendations + +""" + if quality == "excellent": + report += "๐ŸŽ‰ **Excellent coverage!** Consider this approach for production use.\n" + elif quality == "good": + report += "๐Ÿ‘ **Good coverage.** Consider tweaking prompts or selection strategy for improvement.\n" + else: + report += "โš ๏ธ **Coverage needs improvement.** Try different prompts, models, or increase max_tests_per_file.\n" + + if improvement > 5: + report += "๐Ÿ“ˆ **Agent significantly outperforms baseline** - LLM approach is working well.\n" + elif improvement > 0: + report += "๐Ÿ“Š **Agent slightly better than baseline** - room for optimization.\n" + else: + report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" + + # Configuration section + report += """## Configuration + +### Prompt Template +``` +""" + report += prompt_used[:500] + ("..." 
if len(prompt_used) > 500 else "") + report += """ +``` + +### File Coverage Details +""" + + coverage_by_file = agent_results.get("coverage_by_file", {}) + if coverage_by_file: + report += "| File | Coverage |\n|------|----------|\n" + for file_path, coverage_pct in sorted(coverage_by_file.items()): + report += f"| `{file_path}` | {coverage_pct:.1f}% |\n" + else: + report += "No file-level coverage data available.\n" + + report += """ + +--- +*Generated by QualityFlow - Production-ready test generation with ZenML* +""" + + return report \ No newline at end of file diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py new file mode 100644 index 00000000..a4d12385 --- /dev/null +++ b/qualityflow/steps/run_tests.py @@ -0,0 +1,258 @@ +""" +Run tests and collect coverage metrics. +""" + +import subprocess +import tempfile +import shutil +from pathlib import Path +from typing import Annotated, Dict, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def run_tests( + workspace_dir: Path, + tests_dir: Optional[Path], + label: str = "tests", +) -> Annotated[Dict, "test_results"]: + """Run tests and collect coverage metrics. + + Args: + workspace_dir: Path to workspace directory + tests_dir: Path object to tests directory (None if no tests) + label: Label for this test run + + Returns: + Dictionary containing test results and metrics + """ + if tests_dir is None: + logger.info(f"No tests directory provided for {label}, skipping") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 0, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": None, + "coverage_path": None, + "logs_path": None, + "skipped": True, + } + + logger.info(f"Running {label} tests from {tests_dir}") + + # Create output directory for this test run + output_dir = tempfile.mkdtemp(prefix=f"qualityflow_{label}_results_") + output_path = Path(output_dir) + + junit_file = output_path / "junit.xml" + coverage_file = output_path / "coverage.xml" + logs_file = output_path / "test_logs.txt" + + # Copy tests to workspace (pytest needs them in PYTHONPATH) + workspace_tests_dir = Path(workspace_dir) / f"tests_{label}" + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir) + shutil.copytree(tests_dir, workspace_tests_dir) + + try: + # Run pytest with coverage + pytest_cmd = [ + "python", "-m", "pytest", + str(workspace_tests_dir), + "--junitxml", str(junit_file), + "--cov", str(workspace_dir), + "--cov-report", f"xml:{coverage_file}", + "--cov-report", "term", + "-v" + ] + + logger.info(f"Running command: {' '.join(pytest_cmd)}") + logger.info(f"Working directory: {workspace_dir}") + logger.info(f"Test directory: {workspace_tests_dir}") + + # Debug: list test files + if workspace_tests_dir.exists(): + test_files = list(workspace_tests_dir.glob("*.py")) + logger.info(f"Test files found: {[f.name for f in test_files]}") + else: + logger.warning(f"Test directory does not exist: {workspace_tests_dir}") + + result = subprocess.run( + pytest_cmd, + cwd=str(workspace_dir), + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + # Save logs and also log to console for debugging + with open(logs_file, 'w') as f: + f.write(f"Command: {' '.join(pytest_cmd)}\n") + f.write(f"Return code: {result.returncode}\n\n") + f.write("STDOUT:\n") + f.write(result.stdout) + f.write("\nSTDERR:\n") + f.write(result.stderr) + + # Also log the pytest output for debugging + logger.info(f"Pytest return code: 
{result.returncode}") + if result.stdout: + logger.info(f"Pytest stdout: {result.stdout}") + if result.stderr: + logger.info(f"Pytest stderr: {result.stderr}") + + # Parse results + test_results = _parse_test_results( + result, junit_file, coverage_file, logs_file, label + ) + + logger.info(f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage") + + return test_results + + except subprocess.TimeoutExpired: + logger.error(f"Test run for {label} timed out after 5 minutes") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file), + "error": "Test execution timed out", + } + + except Exception as e: + logger.error(f"Failed to run tests for {label}: {e}") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file) if logs_file.exists() else None, + "error": str(e), + } + + finally: + # Clean up copied tests + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir, ignore_errors=True) + + +def _parse_test_results( + result: subprocess.CompletedProcess, + junit_file: Path, + coverage_file: Path, + logs_file: Path, + label: str, +) -> Dict: + """Parse test execution results.""" + + # Parse pytest output for basic stats + tests_passed = 0 + tests_failed = 0 + + if result.stdout: + lines = result.stdout.split('\n') + for line in lines: + if ' passed' in line and ' failed' in line: + # Line like "2 failed, 3 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == 'passed' and i > 0: + tests_passed = int(parts[i-1]) + elif part == 'failed' and i > 0: + tests_failed = int(parts[i-1]) + elif ' passed' in line and 'failed' not in line: + # Line like "5 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == 'passed' and i > 0: + tests_passed = int(parts[i-1]) + + # Parse coverage from XML if available + coverage_total = 0.0 + coverage_by_file = {} + + if coverage_file.exists(): + coverage_total, coverage_by_file = _parse_coverage_xml(coverage_file) + + return { + "label": label, + "tests_passed": tests_passed, + "tests_failed": tests_failed, + "coverage_total": coverage_total, + "coverage_by_file": coverage_by_file, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file), + "return_code": result.returncode, + } + + +def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: + """Parse coverage XML file.""" + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(coverage_file) + root = tree.getroot() + + # Debug: log the XML structure + logger.info(f"Coverage XML root tag: {root.tag}") + logger.info(f"Coverage XML root attribs: {root.attrib}") + + # Get overall coverage - try different formats + coverage_total = 0.0 + + # Modern pytest-cov uses 'coverage' as root element + if root.tag == 'coverage': + line_rate = root.get('line-rate', '0') + if line_rate != '0': + coverage_total = float(line_rate) * 100 + logger.info(f"Found 
line-rate in coverage root: {line_rate}") + else: + # Try finding coverage element nested + coverage_element = root.find('.//coverage') + if coverage_element is not None: + line_rate = coverage_element.get('line-rate', '0') + coverage_total = float(line_rate) * 100 + logger.info(f"Found coverage element with line-rate: {line_rate}") + + # If still no coverage found, try branches-valid attribute (alternative format) + if coverage_total == 0.0: + branches_valid = root.get('branches-valid', '0') + branches_covered = root.get('branches-covered', '0') + lines_valid = root.get('lines-valid', '0') + lines_covered = root.get('lines-covered', '0') + + if lines_valid != '0': + line_coverage = float(lines_covered) / float(lines_valid) + coverage_total = line_coverage * 100 + logger.info(f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%") + + # Get per-file coverage + coverage_by_file = {} + for class_elem in root.findall('.//class'): + filename = class_elem.get('filename', '') + line_rate = class_elem.get('line-rate', '0') + if filename: + coverage_by_file[filename] = float(line_rate) * 100 + + logger.info(f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files") + return coverage_total, coverage_by_file + + except Exception as e: + logger.warning(f"Failed to parse coverage XML: {e}") + return 0.0, {} \ No newline at end of file diff --git a/qualityflow/steps/select_input.py b/qualityflow/steps/select_input.py new file mode 100644 index 00000000..c274e8df --- /dev/null +++ b/qualityflow/steps/select_input.py @@ -0,0 +1,38 @@ +""" +Select input source specification step. +""" + +from typing import Annotated, Dict +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def select_input( + repo_url: str = "https://github.com/psf/requests.git", + ref: str = "main", + target_glob: str = "src/**/*.py", +) -> Annotated[Dict[str, str], "source_spec"]: + """ + Resolve source specification for test generation. 
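`_parse_coverage_xml` above reads the Cobertura-style XML that `pytest-cov` writes for `--cov-report xml:...`. A self-contained sketch of that format and of the same line-rate arithmetic (filenames and numbers invented):

```python
# Sketch of the Cobertura-style coverage XML and the line-rate math used above.
# Filenames and numbers are invented for illustration.
import xml.etree.ElementTree as ET

SAMPLE = """
<coverage line-rate="0.8125" lines-valid="160" lines-covered="130">
  <packages>
    <package name="toy_lib">
      <classes>
        <class filename="examples/toy_lib/calculator.py" line-rate="0.90"/>
        <class filename="examples/toy_lib/string_utils.py" line-rate="0.72"/>
      </classes>
    </package>
  </packages>
</coverage>
"""

root = ET.fromstring(SAMPLE)
total = float(root.get("line-rate", "0")) * 100  # 81.25
per_file = {
    elem.get("filename"): float(elem.get("line-rate", "0")) * 100
    for elem in root.iter("class")
}
print(total, per_file)
```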
+ + Args: + repo_url: Repository URL to analyze + ref: Git reference (branch, tag, commit) + target_glob: Glob pattern for target files + + Returns: + Source specification dictionary + """ + logger.info(f"Selecting input source: {repo_url}@{ref}") + + spec = { + "repo_url": repo_url, + "ref": ref, + "target_glob": target_glob, + } + + logger.info(f"Source spec: {spec}") + return spec \ No newline at end of file From 3656f91c41c7d5be0ae424e16448dfbac55b0e92 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:30:28 +0200 Subject: [PATCH 2/8] Formattingg --- qualityflow/examples/toy_lib/__init__.py | 2 +- qualityflow/examples/toy_lib/calculator.py | 24 ++- qualityflow/examples/toy_lib/string_utils.py | 64 ++++--- qualityflow/pipelines/__init__.py | 2 +- .../pipelines/generate_and_evaluate.py | 45 ++--- qualityflow/run.py | 13 +- qualityflow/steps/__init__.py | 12 +- qualityflow/steps/analyze_code.py | 89 +++++---- qualityflow/steps/evaluate_coverage.py | 51 +++-- qualityflow/steps/fetch_source.py | 38 ++-- qualityflow/steps/gen_tests_agent.py | 180 ++++++++++-------- qualityflow/steps/gen_tests_baseline.py | 96 +++++----- qualityflow/steps/report.py | 136 +++++++------ qualityflow/steps/run_tests.py | 168 +++++++++------- qualityflow/steps/select_input.py | 11 +- 15 files changed, 534 insertions(+), 397 deletions(-) diff --git a/qualityflow/examples/toy_lib/__init__.py b/qualityflow/examples/toy_lib/__init__.py index c70599d5..8b91a8dd 100644 --- a/qualityflow/examples/toy_lib/__init__.py +++ b/qualityflow/examples/toy_lib/__init__.py @@ -2,4 +2,4 @@ QualityFlow toy library example for testing. """ -__version__ = "0.1.0" \ No newline at end of file +__version__ = "0.1.0" diff --git a/qualityflow/examples/toy_lib/calculator.py b/qualityflow/examples/toy_lib/calculator.py index c9ec644d..38bc9964 100644 --- a/qualityflow/examples/toy_lib/calculator.py +++ b/qualityflow/examples/toy_lib/calculator.py @@ -10,25 +10,33 @@ def __init__(self): """Initialize calculator with empty history.""" self.history = [] - def add(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def add( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Add two numbers.""" result = a + b self.history.append(f"{a} + {b} = {result}") return result - def subtract(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def subtract( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Subtract second number from first.""" result = a - b self.history.append(f"{a} - {b} = {result}") return result - def multiply(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def multiply( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Multiply two numbers.""" result = a * b self.history.append(f"{a} * {b} = {result}") return result - def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def divide( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Divide first number by second.""" if b == 0: raise ValueError("Cannot divide by zero") @@ -36,9 +44,11 @@ def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float self.history.append(f"{a} / {b} = {result}") return result - def power(self, base: Union[int, float], exponent: Union[int, float]) -> Union[int, float]: + def power( + self, base: Union[int, float], exponent: Union[int, float] + ) -> Union[int, float]: """Raise base to the power of 
exponent.""" - result = base ** exponent + result = base**exponent self.history.append(f"{base} ** {exponent} = {result}") return result @@ -72,4 +82,4 @@ def is_prime(n: int) -> bool: for i in range(3, int(n**0.5) + 1, 2): if n % i == 0: return False - return True \ No newline at end of file + return True diff --git a/qualityflow/examples/toy_lib/string_utils.py b/qualityflow/examples/toy_lib/string_utils.py index d842b500..276509ab 100644 --- a/qualityflow/examples/toy_lib/string_utils.py +++ b/qualityflow/examples/toy_lib/string_utils.py @@ -3,7 +3,7 @@ """ import re -from typing import List, Optional +from typing import List def reverse_string(s: str) -> str: @@ -17,13 +17,13 @@ def is_palindrome(s: str, ignore_case: bool = True) -> bool: """Check if a string is a palindrome.""" if not isinstance(s, str): raise TypeError("Input must be a string") - + # Clean the string - keep only alphanumeric characters - cleaned = re.sub(r'[^a-zA-Z0-9]', '', s) - + cleaned = re.sub(r"[^a-zA-Z0-9]", "", s) + if ignore_case: cleaned = cleaned.lower() - + return cleaned == cleaned[::-1] @@ -31,10 +31,10 @@ def count_words(text: str) -> int: """Count words in text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + if not text.strip(): return 0 - + words = text.split() return len(words) @@ -43,16 +43,16 @@ def capitalize_words(text: str) -> str: """Capitalize the first letter of each word.""" if not isinstance(text, str): raise TypeError("Input must be a string") - - return ' '.join(word.capitalize() for word in text.split()) + + return " ".join(word.capitalize() for word in text.split()) def extract_emails(text: str) -> List[str]: """Extract email addresses from text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - - email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + + email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" return re.findall(email_pattern, text) @@ -62,59 +62,61 @@ def truncate_string(s: str, max_length: int, suffix: str = "...") -> str: raise TypeError("Input must be a string") if not isinstance(max_length, int) or max_length < 0: raise ValueError("max_length must be a non-negative integer") - + if len(s) <= max_length: return s - + if max_length <= len(suffix): return s[:max_length] - - return s[:max_length - len(suffix)] + suffix + + return s[: max_length - len(suffix)] + suffix class TextProcessor: """Text processing utility class.""" - + def __init__(self, default_encoding: str = "utf-8"): self.default_encoding = default_encoding self.processed_count = 0 - + def clean_text(self, text: str, remove_punctuation: bool = False) -> str: """Clean text by removing extra whitespace and optionally punctuation.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + # Remove extra whitespace - cleaned = ' '.join(text.split()) - + cleaned = " ".join(text.split()) + if remove_punctuation: # Remove punctuation except spaces - cleaned = re.sub(r'[^\w\s]', '', cleaned) - + cleaned = re.sub(r"[^\w\s]", "", cleaned) + self.processed_count += 1 return cleaned - - def word_frequency(self, text: str, ignore_case: bool = True) -> dict[str, int]: + + def word_frequency( + self, text: str, ignore_case: bool = True + ) -> dict[str, int]: """Count word frequency in text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + words = text.split() if ignore_case: words = [word.lower() for word in words] - + frequency = {} for word in words: # Remove punctuation from word 
- clean_word = re.sub(r'[^\w]', '', word) + clean_word = re.sub(r"[^\w]", "", word) if clean_word: frequency[clean_word] = frequency.get(clean_word, 0) + 1 - + return frequency - + def get_stats(self) -> dict[str, int]: """Get processing statistics.""" return { "processed_count": self.processed_count, - "default_encoding": self.default_encoding - } \ No newline at end of file + "default_encoding": self.default_encoding, + } diff --git a/qualityflow/pipelines/__init__.py b/qualityflow/pipelines/__init__.py index 525d0f58..af93c207 100644 --- a/qualityflow/pipelines/__init__.py +++ b/qualityflow/pipelines/__init__.py @@ -2,4 +2,4 @@ from .generate_and_evaluate import generate_and_evaluate -__all__ = ["generate_and_evaluate"] \ No newline at end of file +__all__ = ["generate_and_evaluate"] diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py index c50754e3..e359afb8 100644 --- a/qualityflow/pipelines/generate_and_evaluate.py +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -2,18 +2,15 @@ QualityFlow experiment pipeline for test generation and evaluation. """ -from typing import Annotated - -from zenml import pipeline -from zenml.logger import get_logger - -from steps.select_input import select_input -from steps.fetch_source import fetch_source from steps.analyze_code import analyze_code +from steps.fetch_source import fetch_source from steps.gen_tests_agent import gen_tests_agent from steps.gen_tests_baseline import gen_tests_baseline -from steps.run_tests import run_tests from steps.report import report +from steps.run_tests import run_tests +from steps.select_input import select_input +from zenml import pipeline +from zenml.logger import get_logger logger = get_logger(__name__) @@ -21,38 +18,36 @@ @pipeline(name="generate_and_evaluate") def generate_and_evaluate() -> None: """QualityFlow pipeline for generating and evaluating tests. - + Simple, focused pipeline: 1. Analyze code to find files needing tests - 2. Generate tests using LLM and baseline approaches + 2. Generate tests using LLM and baseline approaches 3. Run tests and measure coverage 4. 
Report results for comparison """ # Step 1: Resolve source specification spec = select_input() - + # Step 2: Fetch and materialize workspace workspace_dir, commit_sha = fetch_source(spec) - + # Step 3: Analyze and select code files - code_summary = analyze_code( - workspace_dir, commit_sha - ) - + code_summary = analyze_code(workspace_dir, commit_sha) + # Step 4: Generate tests using LLM agent - agent_tests_dir, prompt_used = gen_tests_agent( - workspace_dir, code_summary - ) - + agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary) + # Step 5: Generate baseline tests (optional) baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) - + # Step 6: Run agent tests agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent") - + # Step 7: Run baseline tests (if available) - baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline") - + baseline_results = run_tests( + workspace_dir, baseline_tests_dir, label="baseline" + ) + # Step 8: Generate comprehensive report (includes evaluation) report( workspace_dir, @@ -60,4 +55,4 @@ def generate_and_evaluate() -> None: prompt_used, agent_results, baseline_results, - ) \ No newline at end of file + ) diff --git a/qualityflow/run.py b/qualityflow/run.py index 11751350..c9ff7370 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -14,7 +14,7 @@ @click.command() @click.option( "--config", - "-c", + "-c", type=click.Path(exists=True, dir_okay=False), default=None, required=False, @@ -28,7 +28,7 @@ ) def main(config: str | None, no_cache: bool): """Run QualityFlow test generation and coverage analysis pipeline. - + Simple pipeline that generates tests using LLM, runs them, measures coverage, and compares results against baseline approaches. 
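The same pipeline can also be driven without the CLI shown above; a minimal sketch (it assumes the working directory is `qualityflow/` so the plain `pipelines`/`steps` imports used throughout this patch resolve, and it points at the default config added under `configs/`):

```python
# Programmatic equivalent of `python run.py -c configs/experiment.default.yaml --no-cache`.
# Assumes the current working directory is qualityflow/ so that the
# package-less imports used in this patch (pipelines.*, steps.*) resolve.
from pipelines.generate_and_evaluate import generate_and_evaluate

pipeline_instance = generate_and_evaluate.with_options(
    config_path="configs/experiment.default.yaml",
    enable_cache=False,
)
pipeline_instance()
```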
""" @@ -38,10 +38,11 @@ def main(config: str | None, no_cache: bool): chosen_config = config or str(default_config) try: - logger.info(f"Starting QualityFlow pipeline with config: {chosen_config}") + logger.info( + f"Starting QualityFlow pipeline with config: {chosen_config}" + ) pipeline_instance = generate_and_evaluate.with_options( - config_path=chosen_config, - enable_cache=not no_cache + config_path=chosen_config, enable_cache=not no_cache ) pipeline_instance() logger.info("QualityFlow pipeline completed successfully!") @@ -52,4 +53,4 @@ def main(config: str | None, no_cache: bool): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/qualityflow/steps/__init__.py b/qualityflow/steps/__init__.py index 70abca08..c6857ec7 100644 --- a/qualityflow/steps/__init__.py +++ b/qualityflow/steps/__init__.py @@ -1,21 +1,21 @@ """QualityFlow pipeline steps.""" -from .select_input import select_input -from .fetch_source import fetch_source from .analyze_code import analyze_code +from .evaluate_coverage import evaluate_coverage +from .fetch_source import fetch_source from .gen_tests_agent import gen_tests_agent from .gen_tests_baseline import gen_tests_baseline -from .run_tests import run_tests -from .evaluate_coverage import evaluate_coverage from .report import report +from .run_tests import run_tests +from .select_input import select_input __all__ = [ "select_input", "fetch_source", - "analyze_code", + "analyze_code", "gen_tests_agent", "gen_tests_baseline", "run_tests", "evaluate_coverage", "report", -] \ No newline at end of file +] diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py index 0bfebd9c..3a8f5a14 100644 --- a/qualityflow/steps/analyze_code.py +++ b/qualityflow/steps/analyze_code.py @@ -2,23 +2,25 @@ Analyze and select code files for test generation. """ -import glob import ast +import glob import os +from enum import Enum from pathlib import Path -from typing import Annotated, Dict, List, Tuple +from typing import Annotated, Dict, List from zenml import step from zenml.logger import get_logger -from enum import Enum class SelectionStrategy(str, Enum): """Code file selection strategies.""" + LOW_COVERAGE = "low_coverage" CHANGED_FILES = "changed_files" ALL = "all" + logger = get_logger(__name__) @@ -32,80 +34,85 @@ def analyze_code( ) -> Annotated[Dict, "code_summary"]: """ Analyze workspace and select candidate files for test generation. 
- + Args: workspace_dir: Path to workspace directory commit_sha: Git commit SHA target_glob: Glob pattern for target files strategy: File selection strategy max_files: Maximum number of files to select - + Returns: Code summary dictionary containing selected files and metadata """ logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") - + workspace_path = Path(workspace_dir) - + # Find all Python files matching glob pattern all_files = [] for pattern in target_glob.split(","): pattern = pattern.strip() - matched_files = glob.glob(str(workspace_path / pattern), recursive=True) + matched_files = glob.glob( + str(workspace_path / pattern), recursive=True + ) all_files.extend(matched_files) - + # Make paths relative to workspace relative_files = [ - os.path.relpath(f, workspace_dir) - for f in all_files - if f.endswith('.py') and os.path.isfile(f) + os.path.relpath(f, workspace_dir) + for f in all_files + if f.endswith(".py") and os.path.isfile(f) ] - + logger.info(f"Found {len(relative_files)} Python files") - + # Calculate complexity scores complexity_scores = {} valid_files = [] - + for file_path in relative_files: full_path = workspace_path / file_path try: - with open(full_path, 'r', encoding='utf-8') as f: + with open(full_path, "r", encoding="utf-8") as f: content = f.read() - + # Parse AST and calculate basic complexity tree = ast.parse(content) complexity = _calculate_complexity(tree) complexity_scores[file_path] = complexity valid_files.append(file_path) - + except (SyntaxError, UnicodeDecodeError) as e: logger.warning(f"Skipping {file_path} due to parsing error: {e}") continue - + # Select files based on strategy - selected_files = _select_files(valid_files, complexity_scores, strategy, max_files) - + selected_files = _select_files( + valid_files, complexity_scores, strategy, max_files + ) + code_summary = { "selected_files": selected_files, "total_files": len(valid_files), "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", - "complexity_scores": {f: complexity_scores[f] for f in selected_files} + "complexity_scores": {f: complexity_scores[f] for f in selected_files}, } - + logger.info(f"Selected {len(selected_files)} files: {selected_files}") - + return code_summary def _calculate_complexity(tree: ast.AST) -> float: """Calculate basic complexity score for an AST.""" + class ComplexityVisitor(ast.NodeVisitor): def __init__(self): self.complexity = 0 self.functions = 0 self.classes = 0 - + def visit_FunctionDef(self, node): self.functions += 1 self.complexity += 1 @@ -113,39 +120,43 @@ def visit_FunctionDef(self, node): if isinstance(child, (ast.If, ast.For, ast.While, ast.Try)): self.complexity += 1 self.generic_visit(node) - + def visit_ClassDef(self, node): self.classes += 1 self.complexity += 1 self.generic_visit(node) - + visitor = ComplexityVisitor() visitor.visit(tree) - + # Combine metrics into single score return visitor.complexity + visitor.functions * 0.5 + visitor.classes * 2 def _select_files( - files: List[str], - complexity_scores: Dict[str, float], - strategy: SelectionStrategy, - max_files: int + files: List[str], + complexity_scores: Dict[str, float], + strategy: SelectionStrategy, + max_files: int, ) -> List[str]: """Select files based on strategy.""" - + if strategy == SelectionStrategy.ALL: return files[:max_files] - + elif strategy == SelectionStrategy.LOW_COVERAGE: # Prioritize complex files that likely need more tests - sorted_files = sorted(files, key=lambda f: complexity_scores[f], reverse=True) + 
sorted_files = sorted( + files, key=lambda f: complexity_scores[f], reverse=True + ) return sorted_files[:max_files] - + elif strategy == SelectionStrategy.CHANGED_FILES: # For this demo, just return all files (in real implementation, would use git diff) - logger.warning("CHANGED_FILES strategy not fully implemented, falling back to ALL") + logger.warning( + "CHANGED_FILES strategy not fully implemented, falling back to ALL" + ) return files[:max_files] - + else: - raise ValueError(f"Unknown selection strategy: {strategy}") \ No newline at end of file + raise ValueError(f"Unknown selection strategy: {strategy}") diff --git a/qualityflow/steps/evaluate_coverage.py b/qualityflow/steps/evaluate_coverage.py index 9d384b10..f069bfb5 100644 --- a/qualityflow/steps/evaluate_coverage.py +++ b/qualityflow/steps/evaluate_coverage.py @@ -3,7 +3,8 @@ """ from typing import Annotated, Dict, Optional -from zenml import step, Model + +from zenml import step from zenml.logger import get_logger logger = get_logger(__name__) @@ -17,39 +18,55 @@ def evaluate_coverage( ) -> Annotated[Dict, "evaluation_metrics"]: """ Evaluate coverage metrics and compare agent vs baseline approaches. - + Args: agent_results: Test results from agent-generated tests baseline_results: Test results from baseline tests (optional) commit_sha: Current commit SHA - + Returns: Evaluation metrics dictionary with coverage comparison """ logger.info("Evaluating coverage metrics and computing deltas") - + # Extract agent metrics coverage_total_agent = agent_results.get("coverage_total", 0.0) tests_passed_agent = agent_results.get("tests_passed", 0) tests_failed_agent = agent_results.get("tests_failed", 0) - + total_tests_agent = tests_passed_agent + tests_failed_agent - pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 - + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + # Extract baseline metrics coverage_total_baseline = None if baseline_results and not baseline_results.get("skipped", False): coverage_total_baseline = baseline_results.get("coverage_total", 0.0) - + # Compare agent vs baseline coverage coverage_improvement = 0.0 if coverage_total_baseline is not None: coverage_improvement = coverage_total_agent - coverage_total_baseline - + # Analyze coverage quality - pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" - coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" - + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + evaluation_metrics = { "coverage_total_agent": coverage_total_agent, "coverage_total_baseline": coverage_total_baseline, @@ -62,7 +79,9 @@ def evaluate_coverage( "commit_sha": commit_sha, "files_analyzed": len(agent_results.get("coverage_by_file", {})), } - - logger.info(f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, improvement={coverage_improvement:+.2f}%") - - return evaluation_metrics \ No newline at end of file + + logger.info( + f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, 
improvement={coverage_improvement:+.2f}%" + ) + + return evaluation_metrics diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index c117f2d2..dfbfd609 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -2,8 +2,8 @@ Fetch source code workspace step. """ -import tempfile import subprocess +import tempfile from pathlib import Path from typing import Annotated, Dict, Tuple @@ -19,32 +19,41 @@ def fetch_source( ) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: """ Fetch and materialize workspace from git repository. - + Args: source_spec: Source specification from select_input step - + Returns: Tuple of workspace directory path and commit SHA """ repo_url = source_spec["repo_url"] ref = source_spec["ref"] - + logger.info(f"Fetching source from {repo_url}@{ref}") - + # Create temporary workspace workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") workspace_path = Path(workspace_dir) - + try: # Clone repository logger.info(f"Cloning {repo_url} to {workspace_dir}") subprocess.run( - ["git", "clone", "--depth", "1", "--branch", ref, repo_url, workspace_dir], + [ + "git", + "clone", + "--depth", + "1", + "--branch", + ref, + repo_url, + workspace_dir, + ], check=True, capture_output=True, text=True, ) - + # Get commit SHA result = subprocess.run( ["git", "rev-parse", "HEAD"], @@ -54,11 +63,13 @@ def fetch_source( text=True, ) commit_sha = result.stdout.strip() - - logger.info(f"Workspace ready at {workspace_dir}, commit: {commit_sha}") - + + logger.info( + f"Workspace ready at {workspace_dir}, commit: {commit_sha}" + ) + return Path(workspace_dir), commit_sha - + except subprocess.CalledProcessError as e: logger.error(f"Failed to fetch source: {e}") raise RuntimeError(f"Git operation failed: {e.stderr}") @@ -66,5 +77,6 @@ def fetch_source( logger.error(f"Unexpected error fetching source: {e}") # Clean up on error import shutil + shutil.rmtree(workspace_dir, ignore_errors=True) - raise \ No newline at end of file + raise diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 8ed37d31..9a918afc 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -3,22 +3,23 @@ """ import tempfile +from enum import Enum from pathlib import Path -from typing import Annotated, Dict, List, Tuple -from jinja2 import Template +from typing import Annotated, Dict, Tuple -from zenml import step +from jinja2 import Template +from zenml import log_metadata, step from zenml.logger import get_logger -from zenml import log_metadata -from enum import Enum class GenerationProvider(str, Enum): """LLM providers for test generation.""" + OPENAI = "openai" ANTHROPIC = "anthropic" FAKE = "fake" + logger = get_logger(__name__) @@ -31,12 +32,9 @@ def gen_tests_agent( prompt_path: str = "prompts/unit_test_v1.jinja", max_tests_per_file: int = 3, max_files: int = 10, -) -> Tuple[ - Annotated[Path, "agent_tests_dir"], - Annotated[str, "prompt_used"] -]: +) -> Tuple[Annotated[Path, "agent_tests_dir"], Annotated[str, "prompt_used"]]: """Generate tests using LLM agent. 
- + Args: workspace_dir: Path to workspace directory code_summary: Code analysis summary containing selected files @@ -45,86 +43,100 @@ def gen_tests_agent( prompt_path: Path to Jinja2 prompt template max_tests_per_file: Maximum tests to generate per file max_files: Maximum number of files to process (for speed control) - + Returns: Tuple of test directory and prompt used """ # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) - + # Limit files if max_files is specified - files_to_process = selected_files[:max_files] if max_files > 0 else selected_files - logger.info(f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}") - + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}" + ) + # Create tests directory tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") tests_path = Path(tests_dir) - + # Load prompt template workspace_path = Path(workspace_dir) prompt_file = workspace_path / prompt_path - + if prompt_file.exists(): - with open(prompt_file, 'r') as f: + with open(prompt_file, "r") as f: prompt_template = f.read() else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() logger.info(f"Using default prompt template, {prompt_path} not found") - + template = Template(prompt_template) - + total_tokens_in = 0 total_tokens_out = 0 materialized_prompts = {} # Store materialized prompts per file - + for file_path in files_to_process: logger.info(f"Generating tests for {file_path}") - + # Read source file full_file_path = workspace_path / file_path - with open(full_file_path, 'r') as f: + with open(full_file_path, "r") as f: source_code = f.read() - + # Render prompt materialized_prompt = template.render( file_path=file_path, source_code=source_code, max_tests=max_tests_per_file, - complexity_score=code_summary.get("complexity_scores", {}).get(file_path, 0) + complexity_score=code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), ) - + # Store the materialized prompt for this file materialized_prompts[file_path] = materialized_prompt - + # Generate tests using provider if provider == GenerationProvider.FAKE: - generated_tests, tokens = _generate_fake_tests(file_path, source_code, max_tests_per_file) + generated_tests, tokens = _generate_fake_tests( + file_path, source_code, max_tests_per_file + ) elif provider == GenerationProvider.OPENAI: - generated_tests, tokens = _generate_openai_tests(materialized_prompt, model) + generated_tests, tokens = _generate_openai_tests( + materialized_prompt, model + ) elif provider == GenerationProvider.ANTHROPIC: - generated_tests, tokens = _generate_anthropic_tests(materialized_prompt, model) + generated_tests, tokens = _generate_anthropic_tests( + materialized_prompt, model + ) else: raise ValueError(f"Unsupported provider: {provider}") - + total_tokens_in += tokens.get("tokens_in", 0) total_tokens_out += tokens.get("tokens_out", 0) - + # Save generated tests test_file_name = f"test_{Path(file_path).stem}.py" test_file_path = tests_path / test_file_name - - with open(test_file_path, 'w') as f: + + with open(test_file_path, "w") as f: f.write(generated_tests) - + logger.info(f"Generated tests saved to {test_file_path}") - + # Log comprehensive metadata including materialized prompts metadata = { "token_usage": { "tokens_in": total_tokens_in, 
"tokens_out": total_tokens_out, - "cost_estimate": _estimate_cost(total_tokens_in, total_tokens_out, provider, model), + "cost_estimate": _estimate_cost( + total_tokens_in, total_tokens_out, provider, model + ), }, "config": { "provider": provider.value, @@ -136,13 +148,15 @@ def gen_tests_agent( "materialized_prompts": materialized_prompts, "prompt_template": prompt_template, } - + log_metadata(metadata) - logger.info(f"Test generation complete. Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out") - + logger.info( + f"Test generation complete. Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out" + ) + # Create a better prompt summary for the report prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" - + # Return Path object - ZenML will automatically materialize the folder return Path(tests_dir), prompt_summary @@ -168,11 +182,13 @@ def _get_default_prompt_template() -> str: """ -def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tuple[str, Dict]: +def _generate_fake_tests( + file_path: str, source_code: str, max_tests: int +) -> Tuple[str, Dict]: """Generate fake/mock tests for development/testing.""" # Create a simple module name from file path - module_name = file_path.replace('/', '.').replace('.py', '') - + module_name = file_path.replace("/", ".").replace(".py", "") + test_content = f'''""" Generated tests for {file_path} """ @@ -181,7 +197,7 @@ def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tu import unittest from unittest.mock import Mock, patch, MagicMock -class Test{file_path.split('/')[-1].replace('.py', '').title()}(unittest.TestCase): +class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase): """Auto-generated test class for {file_path}.""" def test_module_import(self): @@ -222,7 +238,7 @@ def test_coverage_target(self): if __name__ == "__main__": unittest.main() ''' - + tokens = {"tokens_in": 100, "tokens_out": 50} return test_content, tokens @@ -230,31 +246,35 @@ def test_coverage_target(self): def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: """Generate tests using OpenAI API.""" try: - import openai import os - + + import openai + # Get API key from environment api_key = os.getenv("OPENAI_API_KEY") if not api_key: logger.warning("OPENAI_API_KEY not found, using fake tests") return _generate_fake_tests("openai_file", "mock_code", 3) - + client = openai.OpenAI(api_key=api_key) - + # Call OpenAI API response = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": "You are a Python test generation expert. Generate comprehensive unit tests for the given code."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.", + }, + {"role": "user", "content": prompt}, ], max_tokens=2000, - temperature=0.1 + temperature=0.1, ) - + # Extract test code from response generated_content = response.choices[0].message.content - + # Try to extract Python code blocks if "```python" in generated_content: start = generated_content.find("```python") + 9 @@ -267,16 +287,18 @@ def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: else: # Use the whole response if no code blocks found test_content = generated_content.strip() - + # Token usage for cost estimation tokens = { "tokens_in": response.usage.prompt_tokens, - "tokens_out": response.usage.completion_tokens + "tokens_out": response.usage.completion_tokens, } - - logger.info(f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + + logger.info( + f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) return test_content, tokens - + except ImportError: logger.warning("OpenAI library not installed, using fake tests") return _generate_fake_tests("openai_file", "mock_code", 3) @@ -289,30 +311,34 @@ def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: """Generate tests using Anthropic API.""" try: - import anthropic import os - + + import anthropic + # Get API key from environment api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: logger.warning("ANTHROPIC_API_KEY not found, using fake tests") return _generate_fake_tests("anthropic_file", "mock_code", 3) - + client = anthropic.Anthropic(api_key=api_key) - + # Call Anthropic API response = client.messages.create( model=model, max_tokens=2000, temperature=0.1, messages=[ - {"role": "user", "content": f"You are a Python test generation expert. Generate comprehensive unit tests for the given code.\n\n{prompt}"} - ] + { + "role": "user", + "content": f"You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.\n\n{prompt}", + } + ], ) - + # Extract test content from response generated_content = response.content[0].text - + # Try to extract Python code blocks if "```python" in generated_content: start = generated_content.find("```python") + 9 @@ -325,16 +351,18 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: else: # Use the whole response if no code blocks found test_content = generated_content.strip() - + # Token usage for cost estimation tokens = { "tokens_in": response.usage.input_tokens, - "tokens_out": response.usage.output_tokens + "tokens_out": response.usage.output_tokens, } - - logger.info(f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + + logger.info( + f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) return test_content, tokens - + except ImportError: logger.warning("Anthropic library not installed, using fake tests") return _generate_fake_tests("anthropic_file", "mock_code", 3) @@ -344,7 +372,9 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: return _generate_fake_tests("anthropic_file", "mock_code", 3) -def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str) -> float: +def _estimate_cost( + tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str +) -> float: """Estimate cost based on token usage.""" # Rough cost estimates (would need real pricing) if provider == GenerationProvider.OPENAI: @@ -355,4 +385,4 @@ def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider elif provider == GenerationProvider.ANTHROPIC: return (tokens_in * 0.000008) + (tokens_out * 0.000024) else: - return 0.0 \ No newline at end of file + return 0.0 diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py index 68a0a4e4..a3197748 100644 --- a/qualityflow/steps/gen_tests_baseline.py +++ b/qualityflow/steps/gen_tests_baseline.py @@ -2,8 +2,8 @@ Generate baseline/skeleton tests using heuristics. """ -import tempfile import ast +import tempfile from pathlib import Path from typing import Annotated, Dict, List, Optional @@ -22,65 +22,71 @@ def gen_tests_baseline( ) -> Annotated[Optional[Path], "baseline_tests_dir"]: """ Generate baseline/skeleton tests using heuristic analysis. 
- + Args: workspace_dir: Path to workspace directory code_summary: Code analysis summary containing selected files enabled: Whether baseline generation is enabled max_files: Maximum number of files to process - + Returns: Path to baseline tests directory, or None if disabled """ if not enabled: logger.info("Baseline test generation disabled") return None - + # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) - + # Limit files if max_files is specified - files_to_process = selected_files[:max_files] if max_files > 0 else selected_files - logger.info(f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files") - + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files" + ) + # Create baseline tests directory tests_dir = tempfile.mkdtemp(prefix="qualityflow_baseline_tests_") tests_path = Path(tests_dir) - + workspace_path = Path(workspace_dir) - + for file_path in files_to_process: logger.info(f"Generating baseline tests for {file_path}") - + # Read and parse source file full_file_path = workspace_path / file_path - with open(full_file_path, 'r') as f: + with open(full_file_path, "r") as f: source_code = f.read() - + try: tree = ast.parse(source_code) - + # Extract functions and classes functions, classes = _extract_testable_items(tree) - + # Generate skeleton tests - test_content = _generate_skeleton_tests(file_path, functions, classes) - + test_content = _generate_skeleton_tests( + file_path, functions, classes + ) + # Save baseline tests test_file_name = f"test_{Path(file_path).stem}_baseline.py" test_file_path = tests_path / test_file_name - - with open(test_file_path, 'w') as f: + + with open(test_file_path, "w") as f: f.write(test_content) - + logger.info(f"Baseline tests saved to {test_file_path}") - + except SyntaxError as e: logger.warning(f"Skipping {file_path} due to syntax error: {e}") continue - + logger.info("Baseline test generation complete") - + # Return Path object - ZenML will automatically materialize the folder return Path(tests_dir) @@ -89,23 +95,25 @@ def _extract_testable_items(tree: ast.AST) -> tuple[List[str], List[str]]: """Extract function and class names from AST.""" functions = [] classes = [] - + for node in ast.walk(tree): if isinstance(node, ast.FunctionDef): # Skip private functions (starting with _) - if not node.name.startswith('_'): + if not node.name.startswith("_"): functions.append(node.name) elif isinstance(node, ast.ClassDef): # Skip private classes - if not node.name.startswith('_'): + if not node.name.startswith("_"): classes.append(node.name) - + return functions, classes -def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List[str]) -> str: +def _generate_skeleton_tests( + file_path: str, functions: List[str], classes: List[str] +) -> str: """Generate skeleton test content.""" - + # Create imports section imports = f'''""" Baseline/skeleton tests for {file_path} @@ -116,17 +124,19 @@ def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List import unittest from unittest.mock import Mock, patch ''' - + # Try to determine import path from file path - module_path = file_path.replace('/', '.').replace('.py', '') - if module_path.startswith('src.'): + module_path = file_path.replace("/", ".").replace(".py", "") + if module_path.startswith("src."): module_path = module_path[4:] # Remove 'src.' 
prefix - + if functions or classes: - imports += f"# from {module_path} import {', '.join(functions + classes)}\n\n" + imports += ( + f"# from {module_path} import {', '.join(functions + classes)}\n\n" + ) else: imports += f"# from {module_path} import *\n\n" - + # Generate function tests function_tests = "" for func_name in functions: @@ -141,7 +151,7 @@ def test_{func_name}_error_cases(): # TODO: Test error conditions for {func_name} pass ''' - + # Generate class tests class_tests = "" for class_name in classes: @@ -164,7 +174,7 @@ def test_{class_name.lower()}_methods(self): # TODO: Test class methods pass ''' - + # Add default test if no functions or classes found if not functions and not classes: default_test = ''' @@ -177,14 +187,14 @@ def test_module_imports(self): pass ''' class_tests += default_test - + # Combine all parts test_content = imports + function_tests + class_tests - + # Add main block - test_content += ''' + test_content += """ if __name__ == "__main__": unittest.main() -''' - - return test_content \ No newline at end of file +""" + + return test_content diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 141ecda2..628d5d07 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -3,9 +3,9 @@ """ import tempfile +from datetime import datetime from pathlib import Path from typing import Annotated, Dict, Optional -from datetime import datetime from zenml import step from zenml.logger import get_logger @@ -24,25 +24,29 @@ def report( ) -> Annotated[MarkdownString, "final_report"]: """ Generate comprehensive markdown report for pipeline execution. - + Args: workspace_dir: Workspace directory path commit_sha: Git commit SHA prompt_used: Prompt template used agent_results: Agent test results baseline_results: Baseline test results (optional) - + Returns: Path to generated markdown report """ logger.info("Generating pipeline execution report") - + # Create report file - report_file = Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" - + report_file = ( + Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" + ) + # Evaluate coverage metrics first - evaluation_metrics = _evaluate_coverage_metrics(agent_results, baseline_results, commit_sha) - + evaluation_metrics = _evaluate_coverage_metrics( + agent_results, baseline_results, commit_sha + ) + # Generate report content report_content = _generate_report_content( workspace_dir, @@ -52,13 +56,13 @@ def report( baseline_results, evaluation_metrics, ) - + # Write report file - with open(report_file, 'w') as f: + with open(report_file, "w") as f: f.write(report_content) - + logger.info(f"Report generated: {report_file}") - + # Return as MarkdownString for dashboard visualization return MarkdownString(report_content) @@ -69,27 +73,43 @@ def _evaluate_coverage_metrics( commit_sha: str, ) -> Dict: """Evaluate coverage metrics and compare agent vs baseline approaches.""" - + # Extract agent metrics - use actual values from test results coverage_total_agent = agent_results.get("coverage_total", 0.0) tests_passed_agent = agent_results.get("tests_passed", 0) tests_failed_agent = agent_results.get("tests_failed", 0) - + total_tests_agent = tests_passed_agent + tests_failed_agent - pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 - + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + # Extract baseline metrics coverage_total_baseline = 0.0 if baseline_results and not 
baseline_results.get("skipped", False): coverage_total_baseline = baseline_results.get("coverage_total", 0.0) - + # Compare agent vs baseline coverage coverage_improvement = coverage_total_agent - coverage_total_baseline - + # Analyze coverage quality - pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" - coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" - + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + return { "coverage_total_agent": coverage_total_agent, "coverage_total_baseline": coverage_total_baseline, @@ -113,9 +133,9 @@ def _generate_report_content( evaluation_metrics: Dict, ) -> str: """Generate markdown report content.""" - + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - + # Header report = f"""# QualityFlow Pipeline Report @@ -126,70 +146,74 @@ def _generate_report_content( ## Executive Summary """ - + # Executive summary coverage_agent = evaluation_metrics.get("coverage_total_agent", 0.0) coverage_baseline = evaluation_metrics.get("coverage_total_baseline", 0.0) improvement = evaluation_metrics.get("coverage_improvement", 0.0) quality = evaluation_metrics.get("coverage_quality", "unknown") - - quality_emoji = "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" - improvement_emoji = "๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" - + + quality_emoji = ( + "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" + ) + improvement_emoji = ( + "๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" + ) + report += f"""{quality_emoji} **Coverage Quality**: {quality.upper()} {improvement_emoji} **Agent vs Baseline**: {coverage_agent:.2f}% vs {coverage_baseline:.2f}% ({improvement:+.2f}%) -๐Ÿงช **Tests**: {agent_results.get('tests_passed', 0)} passed, {agent_results.get('tests_failed', 0)} failed -๐Ÿ“ **Files**: {evaluation_metrics.get('files_analyzed', 0)} analyzed +๐Ÿงช **Tests**: {agent_results.get("tests_passed", 0)} passed, {agent_results.get("tests_failed", 0)} failed +๐Ÿ“ **Files**: {evaluation_metrics.get("files_analyzed", 0)} analyzed """ - + # Agent results section report += """## Agent Test Results """ - + if agent_results.get("skipped", False): report += "Agent tests were skipped.\n\n" else: - report += f"""- **Tests Passed**: {agent_results.get('tests_passed', 0)} -- **Tests Failed**: {agent_results.get('tests_failed', 0)} -- **Pass Rate**: {evaluation_metrics.get('pass_rate_agent', 0.0):.1%} -- **Coverage**: {agent_results.get('coverage_total', 0.0):.2f}% -- **JUnit Report**: `{agent_results.get('junit_path', 'N/A')}` -- **Coverage Report**: `{agent_results.get('coverage_path', 'N/A')}` -- **Logs**: `{agent_results.get('logs_path', 'N/A')}` + report += f"""- **Tests Passed**: {agent_results.get("tests_passed", 0)} +- **Tests Failed**: {agent_results.get("tests_failed", 0)} +- **Pass Rate**: {evaluation_metrics.get("pass_rate_agent", 0.0):.1%} +- **Coverage**: {agent_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{agent_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{agent_results.get("coverage_path", "N/A")}` +- **Logs**: `{agent_results.get("logs_path", "N/A")}` """ - + 
# Baseline results section (if available) if baseline_results and not baseline_results.get("skipped", False): report += """## Baseline Test Results """ - report += f"""- **Tests Passed**: {baseline_results.get('tests_passed', 0)} -- **Tests Failed**: {baseline_results.get('tests_failed', 0)} -- **Coverage**: {baseline_results.get('coverage_total', 0.0):.2f}% -- **JUnit Report**: `{baseline_results.get('junit_path', 'N/A')}` -- **Coverage Report**: `{baseline_results.get('coverage_path', 'N/A')}` + report += f"""- **Tests Passed**: {baseline_results.get("tests_passed", 0)} +- **Tests Failed**: {baseline_results.get("tests_failed", 0)} +- **Coverage**: {baseline_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{baseline_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{baseline_results.get("coverage_path", "N/A")}` """ - + # Evaluation metrics section report += """## Coverage Analysis """ - + pass_rate = evaluation_metrics.get("pass_rate_agent", 0.0) pass_quality = evaluation_metrics.get("pass_rate_quality", "unknown") - + report += f"""- **Agent Coverage**: {coverage_agent:.2f}% ({quality}) - **Baseline Coverage**: {coverage_baseline:.2f}% - **Improvement**: {improvement:+.2f}% - **Test Pass Rate**: {pass_rate:.1%} ({pass_quality}) -- **Files Analyzed**: {evaluation_metrics.get('files_analyzed', 0)} +- **Files Analyzed**: {evaluation_metrics.get("files_analyzed", 0)} """ - + # Recommendations section report += """## Recommendations @@ -200,14 +224,14 @@ def _generate_report_content( report += "๐Ÿ‘ **Good coverage.** Consider tweaking prompts or selection strategy for improvement.\n" else: report += "โš ๏ธ **Coverage needs improvement.** Try different prompts, models, or increase max_tests_per_file.\n" - + if improvement > 5: report += "๐Ÿ“ˆ **Agent significantly outperforms baseline** - LLM approach is working well.\n" elif improvement > 0: report += "๐Ÿ“Š **Agent slightly better than baseline** - room for optimization.\n" else: report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" - + # Configuration section report += """## Configuration @@ -220,7 +244,7 @@ def _generate_report_content( ### File Coverage Details """ - + coverage_by_file = agent_results.get("coverage_by_file", {}) if coverage_by_file: report += "| File | Coverage |\n|------|----------|\n" @@ -228,11 +252,11 @@ def _generate_report_content( report += f"| `{file_path}` | {coverage_pct:.1f}% |\n" else: report += "No file-level coverage data available.\n" - + report += """ --- *Generated by QualityFlow - Production-ready test generation with ZenML* """ - - return report \ No newline at end of file + + return report diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py index a4d12385..4c8697f3 100644 --- a/qualityflow/steps/run_tests.py +++ b/qualityflow/steps/run_tests.py @@ -2,9 +2,9 @@ Run tests and collect coverage metrics. """ +import shutil import subprocess import tempfile -import shutil from pathlib import Path from typing import Annotated, Dict, Optional @@ -21,12 +21,12 @@ def run_tests( label: str = "tests", ) -> Annotated[Dict, "test_results"]: """Run tests and collect coverage metrics. 
- + Args: workspace_dir: Path to workspace directory tests_dir: Path object to tests directory (None if no tests) label: Label for this test run - + Returns: Dictionary containing test results and metrics """ @@ -43,46 +43,54 @@ def run_tests( "logs_path": None, "skipped": True, } - + logger.info(f"Running {label} tests from {tests_dir}") - + # Create output directory for this test run output_dir = tempfile.mkdtemp(prefix=f"qualityflow_{label}_results_") output_path = Path(output_dir) - + junit_file = output_path / "junit.xml" coverage_file = output_path / "coverage.xml" logs_file = output_path / "test_logs.txt" - + # Copy tests to workspace (pytest needs them in PYTHONPATH) workspace_tests_dir = Path(workspace_dir) / f"tests_{label}" if workspace_tests_dir.exists(): shutil.rmtree(workspace_tests_dir) shutil.copytree(tests_dir, workspace_tests_dir) - + try: # Run pytest with coverage pytest_cmd = [ - "python", "-m", "pytest", + "python", + "-m", + "pytest", str(workspace_tests_dir), - "--junitxml", str(junit_file), - "--cov", str(workspace_dir), - "--cov-report", f"xml:{coverage_file}", - "--cov-report", "term", - "-v" + "--junitxml", + str(junit_file), + "--cov", + str(workspace_dir), + "--cov-report", + f"xml:{coverage_file}", + "--cov-report", + "term", + "-v", ] - + logger.info(f"Running command: {' '.join(pytest_cmd)}") logger.info(f"Working directory: {workspace_dir}") logger.info(f"Test directory: {workspace_tests_dir}") - + # Debug: list test files if workspace_tests_dir.exists(): test_files = list(workspace_tests_dir.glob("*.py")) logger.info(f"Test files found: {[f.name for f in test_files]}") else: - logger.warning(f"Test directory does not exist: {workspace_tests_dir}") - + logger.warning( + f"Test directory does not exist: {workspace_tests_dir}" + ) + result = subprocess.run( pytest_cmd, cwd=str(workspace_dir), @@ -90,32 +98,34 @@ def run_tests( text=True, timeout=300, # 5 minute timeout ) - + # Save logs and also log to console for debugging - with open(logs_file, 'w') as f: + with open(logs_file, "w") as f: f.write(f"Command: {' '.join(pytest_cmd)}\n") f.write(f"Return code: {result.returncode}\n\n") f.write("STDOUT:\n") f.write(result.stdout) - f.write("\nSTDERR:\n") + f.write("\nSTDERR:\n") f.write(result.stderr) - + # Also log the pytest output for debugging logger.info(f"Pytest return code: {result.returncode}") if result.stdout: logger.info(f"Pytest stdout: {result.stdout}") if result.stderr: logger.info(f"Pytest stderr: {result.stderr}") - + # Parse results test_results = _parse_test_results( result, junit_file, coverage_file, logs_file, label ) - - logger.info(f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage") - + + logger.info( + f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage" + ) + return test_results - + except subprocess.TimeoutExpired: logger.error(f"Test run for {label} timed out after 5 minutes") return { @@ -125,11 +135,13 @@ def run_tests( "coverage_total": 0.0, "coverage_by_file": {}, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file), "error": "Test execution timed out", } - + except Exception as e: logger.error(f"Failed to run tests for {label}: 
{e}") return { @@ -139,11 +151,13 @@ def run_tests( "coverage_total": 0.0, "coverage_by_file": {}, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file) if logs_file.exists() else None, "error": str(e), } - + finally: # Clean up copied tests if workspace_tests_dir.exists(): @@ -153,41 +167,41 @@ def run_tests( def _parse_test_results( result: subprocess.CompletedProcess, junit_file: Path, - coverage_file: Path, + coverage_file: Path, logs_file: Path, label: str, ) -> Dict: """Parse test execution results.""" - + # Parse pytest output for basic stats tests_passed = 0 tests_failed = 0 - + if result.stdout: - lines = result.stdout.split('\n') + lines = result.stdout.split("\n") for line in lines: - if ' passed' in line and ' failed' in line: + if " passed" in line and " failed" in line: # Line like "2 failed, 3 passed in 1.23s" parts = line.split() for i, part in enumerate(parts): - if part == 'passed' and i > 0: - tests_passed = int(parts[i-1]) - elif part == 'failed' and i > 0: - tests_failed = int(parts[i-1]) - elif ' passed' in line and 'failed' not in line: + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + elif part == "failed" and i > 0: + tests_failed = int(parts[i - 1]) + elif " passed" in line and "failed" not in line: # Line like "5 passed in 1.23s" parts = line.split() for i, part in enumerate(parts): - if part == 'passed' and i > 0: - tests_passed = int(parts[i-1]) - + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + # Parse coverage from XML if available coverage_total = 0.0 coverage_by_file = {} - + if coverage_file.exists(): coverage_total, coverage_by_file = _parse_coverage_xml(coverage_file) - + return { "label": label, "tests_passed": tests_passed, @@ -195,7 +209,9 @@ def _parse_test_results( "coverage_total": coverage_total, "coverage_by_file": coverage_by_file, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file), "return_code": result.returncode, } @@ -205,54 +221,60 @@ def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: """Parse coverage XML file.""" try: import xml.etree.ElementTree as ET - + tree = ET.parse(coverage_file) root = tree.getroot() - + # Debug: log the XML structure logger.info(f"Coverage XML root tag: {root.tag}") logger.info(f"Coverage XML root attribs: {root.attrib}") - + # Get overall coverage - try different formats coverage_total = 0.0 - + # Modern pytest-cov uses 'coverage' as root element - if root.tag == 'coverage': - line_rate = root.get('line-rate', '0') - if line_rate != '0': + if root.tag == "coverage": + line_rate = root.get("line-rate", "0") + if line_rate != "0": coverage_total = float(line_rate) * 100 logger.info(f"Found line-rate in coverage root: {line_rate}") else: # Try finding coverage element nested - coverage_element = root.find('.//coverage') + coverage_element = root.find(".//coverage") if coverage_element is not None: - line_rate = coverage_element.get('line-rate', '0') + line_rate = coverage_element.get("line-rate", "0") coverage_total = float(line_rate) * 100 - logger.info(f"Found coverage element with line-rate: {line_rate}") - + logger.info( + f"Found coverage element with line-rate: 
{line_rate}" + ) + # If still no coverage found, try branches-valid attribute (alternative format) if coverage_total == 0.0: - branches_valid = root.get('branches-valid', '0') - branches_covered = root.get('branches-covered', '0') - lines_valid = root.get('lines-valid', '0') - lines_covered = root.get('lines-covered', '0') - - if lines_valid != '0': + branches_valid = root.get("branches-valid", "0") + branches_covered = root.get("branches-covered", "0") + lines_valid = root.get("lines-valid", "0") + lines_covered = root.get("lines-covered", "0") + + if lines_valid != "0": line_coverage = float(lines_covered) / float(lines_valid) coverage_total = line_coverage * 100 - logger.info(f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%") - + logger.info( + f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%" + ) + # Get per-file coverage coverage_by_file = {} - for class_elem in root.findall('.//class'): - filename = class_elem.get('filename', '') - line_rate = class_elem.get('line-rate', '0') + for class_elem in root.findall(".//class"): + filename = class_elem.get("filename", "") + line_rate = class_elem.get("line-rate", "0") if filename: coverage_by_file[filename] = float(line_rate) * 100 - - logger.info(f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files") + + logger.info( + f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files" + ) return coverage_total, coverage_by_file - + except Exception as e: logger.warning(f"Failed to parse coverage XML: {e}") - return 0.0, {} \ No newline at end of file + return 0.0, {} diff --git a/qualityflow/steps/select_input.py b/qualityflow/steps/select_input.py index c274e8df..ff16e391 100644 --- a/qualityflow/steps/select_input.py +++ b/qualityflow/steps/select_input.py @@ -3,6 +3,7 @@ """ from typing import Annotated, Dict + from zenml import step from zenml.logger import get_logger @@ -17,22 +18,22 @@ def select_input( ) -> Annotated[Dict[str, str], "source_spec"]: """ Resolve source specification for test generation. 
- + Args: repo_url: Repository URL to analyze ref: Git reference (branch, tag, commit) target_glob: Glob pattern for target files - + Returns: Source specification dictionary """ logger.info(f"Selecting input source: {repo_url}@{ref}") - + spec = { "repo_url": repo_url, "ref": ref, "target_glob": target_glob, } - + logger.info(f"Source spec: {spec}") - return spec \ No newline at end of file + return spec From f04f49072838d75fae5eee08165460257e64dfc5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:33:07 +0200 Subject: [PATCH 3/8] Update loading prompt template and add log for missing file --- qualityflow/steps/gen_tests_agent.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 9a918afc..5026891f 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -62,19 +62,24 @@ def gen_tests_agent( tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") tests_path = Path(tests_dir) - # Load prompt template - workspace_path = Path(workspace_dir) - prompt_file = workspace_path / prompt_path + # Load prompt template from QualityFlow project directory + # Note: workspace_dir is the cloned repo, but prompts are in QualityFlow project + project_root = Path(__file__).parent.parent # Go up from steps/ to project root + prompt_file = project_root / prompt_path if prompt_file.exists(): with open(prompt_file, "r") as f: prompt_template = f.read() + logger.info(f"Loaded prompt template from {prompt_file}") else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() - logger.info(f"Using default prompt template, {prompt_path} not found") + logger.info(f"Using default prompt template, {prompt_path} not found at {prompt_file}") template = Template(prompt_template) + + # Keep workspace_path for reading source files + workspace_path = Path(workspace_dir) total_tokens_in = 0 total_tokens_out = 0 From da184b478bc17da8817e877e5eefbfda69cd956a Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 08:30:28 +0200 Subject: [PATCH 4/8] Remove unnecessary 'ast' requirement from file --- qualityflow/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt index 72c59212..2d3f977e 100644 --- a/qualityflow/requirements.txt +++ b/qualityflow/requirements.txt @@ -12,7 +12,7 @@ pytest-cov>=4.0.0,<5.0.0 coverage>=7.0.0,<8.0.0 # Code Analysis -ast>=3.9 +# ast is built-in, no need to install # Git Integration gitpython>=3.1.0,<4.0.0 From 44c095455f075a231f88ab8bcce54291dff44add Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 11:23:16 +0200 Subject: [PATCH 5/8] Update test generation pipeline for QualityFlow.- Improved configurations and code organization --- qualityflow/README.md | 110 ++++++++---------------- qualityflow/run.py | 12 ++- qualityflow/steps/fetch_source.py | 3 + qualityflow/steps/gen_tests_agent.py | 10 ++- qualityflow/steps/gen_tests_baseline.py | 4 + qualityflow/steps/report.py | 4 + 6 files changed, 64 insertions(+), 79 deletions(-) diff --git a/qualityflow/README.md b/qualityflow/README.md index 490e9aa2..b7a76022 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -63,37 +63,39 @@ The main pipeline handles the complete test generation workflow: 
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ ``` -## ๐Ÿ“ฆ Quick Start +## ๐Ÿš€ Quick Start -### Prerequisites - -- Python 3.9+ -- ZenML installed (`pip install zenml`) -- Git -- OpenAI API key (optional, can use fake provider) - -### Setup +Get QualityFlow running in 3 simple steps: +### 1. Install Dependencies ```bash pip install -r requirements.txt ``` -2. **Set up OpenAI (optional)**: +### 2. Optional: Set up OpenAI API Key ```bash export OPENAI_API_KEY="your-api-key-here" ``` +*Skip this step to use the fake provider for testing* -3. **Run the pipeline**: +### 3. Run the Pipeline ```bash python run.py ``` -That's it! The pipeline will: -- Clone the configured repository (default: requests library) -- Analyze Python files and select candidates -- Generate tests using OpenAI (or fake provider if no API key) +**That's it!** The pipeline will automatically: +- Clone a sample repository (requests library by default) +- Analyze Python files and select test candidates +- Generate tests using LLM or fake provider - Run tests and measure coverage -- Generate a comprehensive report comparing approaches +- Create a detailed comparison report + +### What Happens Next? + +- Check the ZenML dashboard to see pipeline results +- View generated test files and coverage reports +- Compare LLM vs baseline test approaches +- Experiment with different configurations ## โš™๏ธ Configuration @@ -171,18 +173,17 @@ Requirements: ### A/B Testing Experiments -Use run templates for systematic comparisons: +Compare different configurations by running with different config files: ```bash # Compare prompt versions -python scripts/run_experiment.py --config configs/experiment.default.yaml -python scripts/run_experiment.py --config configs/experiment.strict.yaml +python run.py --config configs/experiment.default.yaml +python run.py --config configs/experiment.strict.yaml -# Compare in ZenML dashboard: +# Compare results in ZenML dashboard: # - Coverage metrics # - Test quality scores # - Token usage and cost -# - Promotion decisions ``` ### Production Deployment @@ -199,36 +200,23 @@ zenml stack register production_stack \ -a s3_store -c ecr_registry -o k8s_orchestrator --set ``` -### Scheduled Regression - -Register batch regression for daily execution: +### Scheduled Execution -```bash -python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule -``` +For automated runs, set up scheduled execution using your preferred orchestration tool or ZenML's scheduling features. 
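As a minimal sketch of the ZenML route (assuming the active orchestrator supports schedules; the cron expression and config path below are illustrative, not part of this repo):

```python
from zenml.config.schedule import Schedule

from pipelines import generate_and_evaluate

# Illustrative nightly schedule (02:00); requires an orchestrator with schedule support.
nightly = Schedule(cron_expression="0 2 * * *")

if __name__ == "__main__":
    # Attach the schedule and submit the pipeline with the default experiment config.
    generate_and_evaluate.with_options(
        config_path="configs/experiment.default.yaml",
        schedule=nightly,
    )()
```

Running this once should register the scheduled deployment with the active stack; drop the `schedule` argument to return to ad-hoc runs.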
## ๐Ÿ—๏ธ Project Structure ``` qualityflow/ โ”œโ”€โ”€ README.md -โ”œโ”€โ”€ pyproject.toml โ”œโ”€โ”€ requirements.txt -โ”œโ”€โ”€ .env.example -โ”œโ”€โ”€ zenml.yaml โ”‚ โ”œโ”€โ”€ configs/ # Pipeline configurations โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings -โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates -โ”‚ โ””โ”€โ”€ schedule.batch.yaml # Batch regression schedule -โ”‚ -โ”œโ”€โ”€ domain/ # Core data models -โ”‚ โ”œโ”€โ”€ schema.py # Pydantic models -โ”‚ โ””โ”€โ”€ stages.py # Deployment stages +โ”‚ โ””โ”€โ”€ experiment.strict.yaml # High-quality gates โ”‚ โ”œโ”€โ”€ pipelines/ # Pipeline definitions -โ”‚ โ”œโ”€โ”€ generate_and_evaluate.py # Experiment pipeline -โ”‚ โ””โ”€โ”€ batch_regression.py # Scheduled regression +โ”‚ โ””โ”€โ”€ generate_and_evaluate.py # Main pipeline โ”‚ โ”œโ”€โ”€ steps/ # Pipeline steps โ”‚ โ”œโ”€โ”€ select_input.py # Source specification @@ -237,43 +225,27 @@ qualityflow/ โ”‚ โ”œโ”€โ”€ gen_tests_agent.py # LLM test generation โ”‚ โ”œโ”€โ”€ gen_tests_baseline.py # Heuristic test generation โ”‚ โ”œโ”€โ”€ run_tests.py # Test execution & coverage -โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics & gate evaluation -โ”‚ โ”œโ”€โ”€ compare_and_promote.py # Model registry promotion -โ”‚ โ”œโ”€โ”€ resolve_test_pack.py # Test pack resolution +โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics evaluation โ”‚ โ””โ”€โ”€ report.py # Report generation โ”‚ โ”œโ”€โ”€ prompts/ # Jinja2 prompt templates โ”‚ โ”œโ”€โ”€ unit_test_v1.jinja # Standard test generation โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation โ”‚ -โ”œโ”€โ”€ materializers/ # Custom artifact handling -โ”œโ”€โ”€ utils/ # Utility functions -โ”‚ -โ”œโ”€โ”€ registry/ # Test Pack registry docs -โ”‚ โ””โ”€โ”€ README.md -โ”‚ -โ”œโ”€โ”€ run_templates/ # Experiment templates -โ”‚ โ”œโ”€โ”€ ab_agent_vs_strict.json # A/B testing configuration -โ”‚ โ””โ”€โ”€ baseline_only.json # Baseline establishment -โ”‚ -โ”œโ”€โ”€ scripts/ # CLI scripts -โ”‚ โ”œโ”€โ”€ run_experiment.py # Experiment runner -โ”‚ โ””โ”€โ”€ run_batch.py # Batch regression runner +โ”œโ”€โ”€ examples/ # Demo code for testing +โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library +โ”‚ โ”œโ”€โ”€ calculator.py +โ”‚ โ””โ”€โ”€ string_utils.py โ”‚ -โ””โ”€โ”€ examples/ # Demo code for testing - โ””โ”€โ”€ toy_lib/ # Sample library - โ”œโ”€โ”€ calculator.py - โ””โ”€โ”€ string_utils.py +โ””โ”€โ”€ run.py # Main entry point ``` ### Key Components -- **Domain Models**: Pydantic schemas for type safety and validation - **Pipeline Steps**: Modular, reusable components with clear interfaces - **Prompt Templates**: Jinja2 templates for LLM test generation -- **Configuration**: YAML-driven experiment and deployment settings -- **Quality Gates**: Configurable thresholds for coverage and promotion -- **Model Registry**: ZenML Model Registry integration for test pack versioning +- **Configuration**: YAML-driven experiment settings +- **Test Generation**: Both LLM-based and heuristic approaches for comparison ## ๐Ÿš€ Production Deployment @@ -295,17 +267,7 @@ zenml stack register production \ ### Scheduled Execution -Set up automated regression testing: - -```bash -# Register schedule (example with ZenML Cloud) -python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule - -# Monitor via dashboard: -# - Daily regression results -# - Coverage trend analysis -# - Test pack performance -``` +Set up automated regression testing using ZenML's scheduling capabilities or your preferred orchestration platform. 
## ๐Ÿค Contributing @@ -344,7 +306,7 @@ Run with debug logging: ```bash export ZENML_LOGGING_VERBOSITY=DEBUG -python scripts/run_experiment.py --config configs/experiment.default.yaml +python run.py --config configs/experiment.default.yaml ``` ## ๐Ÿ“š Resources diff --git a/qualityflow/run.py b/qualityflow/run.py index c9ff7370..40366aac 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -3,6 +3,7 @@ """ from pathlib import Path +from typing import Union import click from pipelines import generate_and_evaluate @@ -26,15 +27,20 @@ default=False, help="Disable pipeline caching and force fresh execution", ) -def main(config: str | None, no_cache: bool): +def main(config: Union[str, None], no_cache: bool): """Run QualityFlow test generation and coverage analysis pipeline. Simple pipeline that generates tests using LLM, runs them, measures coverage, and compares results against baseline approaches. """ - project_root = Path(__file__).parent - default_config = project_root / "configs" / "experiment.default.yaml" + try: + project_root = Path(__file__).resolve().parent + default_config = project_root / "configs" / "experiment.default.yaml" + except Exception: + # Fallback to current working directory + default_config = Path.cwd() / "configs" / "experiment.default.yaml" + chosen_config = config or str(default_config) try: diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index dfbfd609..82a88f5a 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -1,5 +1,8 @@ """ Fetch source code workspace step. + +This module provides functionality to clone Git repositories and prepare +workspaces for code analysis and test generation. """ import subprocess diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 5026891f..b995490e 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -64,8 +64,14 @@ def gen_tests_agent( # Load prompt template from QualityFlow project directory # Note: workspace_dir is the cloned repo, but prompts are in QualityFlow project - project_root = Path(__file__).parent.parent # Go up from steps/ to project root - prompt_file = project_root / prompt_path + try: + # Try to resolve project root more robustly + current_file = Path(__file__).resolve() + project_root = current_file.parent.parent # Go up from steps/ to project root + prompt_file = project_root / prompt_path + except Exception: + # Fallback to current working directory if path resolution fails + prompt_file = Path.cwd() / prompt_path if prompt_file.exists(): with open(prompt_file, "r") as f: diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py index a3197748..db712e81 100644 --- a/qualityflow/steps/gen_tests_baseline.py +++ b/qualityflow/steps/gen_tests_baseline.py @@ -1,5 +1,9 @@ """ Generate baseline/skeleton tests using heuristics. + +This module creates simple test templates by analyzing Python AST to identify +functions and classes, generating skeleton test code for comparison with +LLM-generated tests. """ import ast diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 628d5d07..3d7fcdc5 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -1,5 +1,9 @@ """ Generate comprehensive pipeline report. 
+ +This module creates detailed markdown reports comparing LLM-generated tests +against baseline tests, including coverage metrics, quality assessments, +and recommendations for improvement. """ import tempfile From 7294da94a8c6e524f64877d1267260c4eba8461d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:16:44 +0200 Subject: [PATCH 6/8] Add local examples testing option --- qualityflow/README.md | 33 +++- qualityflow/configs/experiment.local.yaml | 40 +++++ .../pipelines/generate_and_evaluate.py | 8 +- qualityflow/run.py | 2 +- qualityflow/steps/analyze_code.py | 11 +- qualityflow/steps/fetch_source.py | 48 +++++- qualityflow/steps/gen_tests_agent.py | 156 ++++++++++++++++-- qualityflow/steps/report.py | 21 +-- qualityflow/steps/run_tests.py | 2 - 9 files changed, 271 insertions(+), 50 deletions(-) create mode 100644 qualityflow/configs/experiment.local.yaml diff --git a/qualityflow/README.md b/qualityflow/README.md index b7a76022..8b183404 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -97,6 +97,16 @@ python run.py - Compare LLM vs baseline test approaches - Experiment with different configurations +### Local Testing Option + +For offline development or controlled testing, use the local examples: + +```bash +python run.py --config configs/experiment.local.yaml +``` + +This uses the included `examples/toy_lib/` code instead of cloning external repositories. + ## โš™๏ธ Configuration ### Key Parameters @@ -125,14 +135,17 @@ steps: ### Pipeline Options ```bash -# Use fake provider (no API key needed) -python run.py # Uses config defaults +# Default: uses remote repository (requests library) +python run.py -# Force fresh execution (no caching) -python run.py --no-cache +# Local testing with included examples +python run.py --config configs/experiment.local.yaml -# Use different config +# High-quality test generation python run.py --config configs/experiment.strict.yaml + +# Force fresh execution (no caching) +python run.py --no-cache ``` ## ๐Ÿ”ฌ Advanced Usage @@ -213,7 +226,8 @@ qualityflow/ โ”‚ โ”œโ”€โ”€ configs/ # Pipeline configurations โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings -โ”‚ โ””โ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ””โ”€โ”€ experiment.local.yaml # Local examples testing โ”‚ โ”œโ”€โ”€ pipelines/ # Pipeline definitions โ”‚ โ””โ”€โ”€ generate_and_evaluate.py # Main pipeline @@ -233,9 +247,9 @@ qualityflow/ โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation โ”‚ โ”œโ”€โ”€ examples/ # Demo code for testing -โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library -โ”‚ โ”œโ”€โ”€ calculator.py -โ”‚ โ””โ”€โ”€ string_utils.py +โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library with test-friendly code +โ”‚ โ”œโ”€โ”€ calculator.py # Calculator class with edge cases +โ”‚ โ””โ”€โ”€ string_utils.py # String utilities with validation โ”‚ โ””โ”€โ”€ run.py # Main entry point ``` @@ -246,6 +260,7 @@ qualityflow/ - **Prompt Templates**: Jinja2 templates for LLM test generation - **Configuration**: YAML-driven experiment settings - **Test Generation**: Both LLM-based and heuristic approaches for comparison +- **Example Code**: Sample Python modules (`toy_lib`) designed for effective test generation demonstration ## ๐Ÿš€ Production Deployment diff --git a/qualityflow/configs/experiment.local.yaml b/qualityflow/configs/experiment.local.yaml new file mode 100644 index 00000000..477f089e --- /dev/null +++ b/qualityflow/configs/experiment.local.yaml @@ -0,0 +1,40 @@ +# 
QualityFlow Local Examples Configuration +# Use local toy_lib examples instead of remote repositories + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration - using local examples +steps: + select_input: + parameters: + # Use local examples instead of remote repo + repo_url: "local" + ref: "main" + target_glob: "examples/**/*.py" # Target the toy_lib examples + + analyze_code: + parameters: + strategy: "all" # Include all example files + max_files: 5 # Process all toy_lib files + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "fake" # Use fake provider by default for local testing + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 5 # Process all toy_lib files + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent max_files + +# Resource configuration +settings: + docker: + requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py index e359afb8..7050b5bd 100644 --- a/qualityflow/pipelines/generate_and_evaluate.py +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -32,10 +32,12 @@ def generate_and_evaluate() -> None: workspace_dir, commit_sha = fetch_source(spec) # Step 3: Analyze and select code files - code_summary = analyze_code(workspace_dir, commit_sha) + code_summary = analyze_code(workspace_dir, commit_sha, spec) # Step 4: Generate tests using LLM agent - agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary) + agent_tests_dir, test_summary = gen_tests_agent( + workspace_dir, code_summary + ) # Step 5: Generate baseline tests (optional) baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) @@ -52,7 +54,7 @@ def generate_and_evaluate() -> None: report( workspace_dir, commit_sha, - prompt_used, + test_summary, agent_results, baseline_results, ) diff --git a/qualityflow/run.py b/qualityflow/run.py index 40366aac..b4a9c513 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -40,7 +40,7 @@ def main(config: Union[str, None], no_cache: bool): except Exception: # Fallback to current working directory default_config = Path.cwd() / "configs" / "experiment.default.yaml" - + chosen_config = config or str(default_config) try: diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py index 3a8f5a14..7cc5822c 100644 --- a/qualityflow/steps/analyze_code.py +++ b/qualityflow/steps/analyze_code.py @@ -28,7 +28,7 @@ class SelectionStrategy(str, Enum): def analyze_code( workspace_dir: Path, commit_sha: str, - target_glob: str = "src/**/*.py", + source_spec: Dict[str, str], strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, max_files: int = 10, ) -> Annotated[Dict, "code_summary"]: @@ -38,14 +38,19 @@ def analyze_code( Args: workspace_dir: Path to workspace directory commit_sha: Git commit SHA - target_glob: Glob pattern for target files + source_spec: Source specification containing target_glob and other settings strategy: File selection strategy max_files: Maximum number of files to select Returns: Code summary dictionary containing selected files and metadata """ - logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") + # Extract target_glob from source spec + target_glob = source_spec.get("target_glob", "src/**/*.py") + + logger.info( + f"Analyzing code in {workspace_dir} with strategy {strategy} and 
glob {target_glob}" + ) workspace_path = Path(workspace_dir) diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index 82a88f5a..cdf37548 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -21,7 +21,7 @@ def fetch_source( source_spec: Dict[str, str], ) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: """ - Fetch and materialize workspace from git repository. + Fetch and materialize workspace from git repository or use local examples. Args: source_spec: Source specification from select_input step @@ -32,9 +32,53 @@ def fetch_source( repo_url = source_spec["repo_url"] ref = source_spec["ref"] + # Handle local examples case + if repo_url == "local": + logger.info("Using local QualityFlow examples") + try: + # Get the project root (QualityFlow directory) + current_file = Path(__file__).resolve() + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root + + # Create temporary workspace and copy examples + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_local_workspace_" + ) + workspace_path = Path(workspace_dir) + + # Copy examples directory to the temporary workspace + import shutil + + examples_src = project_root / "examples" + examples_dest = workspace_path / "examples" + + if examples_src.exists(): + shutil.copytree(examples_src, examples_dest) + logger.info( + f"Copied examples from {examples_src} to {examples_dest}" + ) + else: + logger.warning( + f"Examples directory not found at {examples_src}" + ) + + commit_sha = "local-examples" + logger.info(f"Local workspace ready at {workspace_path}") + return workspace_path, commit_sha + + except Exception as e: + logger.error(f"Failed to set up local workspace: {e}") + # Fallback to current working directory + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_fallback_workspace_" + ) + return Path(workspace_dir), "local-fallback" + logger.info(f"Fetching source from {repo_url}@{ref}") - # Create temporary workspace + # Create temporary workspace for remote repositories workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") workspace_path = Path(workspace_dir) diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index b995490e..de879731 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -10,6 +10,7 @@ from jinja2 import Template from zenml import log_metadata, step from zenml.logger import get_logger +from zenml.types import MarkdownString class GenerationProvider(str, Enum): @@ -32,7 +33,10 @@ def gen_tests_agent( prompt_path: str = "prompts/unit_test_v1.jinja", max_tests_per_file: int = 3, max_files: int = 10, -) -> Tuple[Annotated[Path, "agent_tests_dir"], Annotated[str, "prompt_used"]]: +) -> Tuple[ + Annotated[Path, "agent_tests_dir"], + Annotated[MarkdownString, "test_summary"], +]: """Generate tests using LLM agent. 
Args: @@ -45,7 +49,7 @@ def gen_tests_agent( max_files: Maximum number of files to process (for speed control) Returns: - Tuple of test directory and prompt used + Tuple of test directory and test generation summary """ # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) @@ -67,7 +71,9 @@ def gen_tests_agent( try: # Try to resolve project root more robustly current_file = Path(__file__).resolve() - project_root = current_file.parent.parent # Go up from steps/ to project root + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root prompt_file = project_root / prompt_path except Exception: # Fallback to current working directory if path resolution fails @@ -80,16 +86,19 @@ def gen_tests_agent( else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() - logger.info(f"Using default prompt template, {prompt_path} not found at {prompt_file}") + logger.info( + f"Using default prompt template, {prompt_path} not found at {prompt_file}" + ) template = Template(prompt_template) - + # Keep workspace_path for reading source files workspace_path = Path(workspace_dir) total_tokens_in = 0 total_tokens_out = 0 - materialized_prompts = {} # Store materialized prompts per file + test_snippets = {} # Store test snippets per file + test_stats = {} # Store test statistics per file for file_path in files_to_process: logger.info(f"Generating tests for {file_path}") @@ -109,8 +118,15 @@ def gen_tests_agent( ), ) - # Store the materialized prompt for this file - materialized_prompts[file_path] = materialized_prompt + # Store test generation info for this file + test_stats[file_path] = { + "provider": provider.value, + "model": model, + "max_tests": max_tests_per_file, + "complexity_score": code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), + } # Generate tests using provider if provider == GenerationProvider.FAKE: @@ -138,9 +154,26 @@ def gen_tests_agent( with open(test_file_path, "w") as f: f.write(generated_tests) + # Store test snippet for summary (first 20 lines) + test_lines = generated_tests.split("\n") + snippet_lines = test_lines[:20] + if len(test_lines) > 20: + snippet_lines.append("... (truncated)") + test_snippets[file_path] = "\n".join(snippet_lines) + + # Update test stats with actual counts + test_stats[file_path]["lines_generated"] = len(test_lines) + test_stats[file_path]["test_functions"] = len( + [ + line + for line in test_lines + if line.strip().startswith("def test_") + ] + ) + logger.info(f"Generated tests saved to {test_file_path}") - # Log comprehensive metadata including materialized prompts + # Log comprehensive metadata metadata = { "token_usage": { "tokens_in": total_tokens_in, @@ -156,8 +189,7 @@ def gen_tests_agent( "max_tests_per_file": max_tests_per_file, "files_processed": len(files_to_process), }, - "materialized_prompts": materialized_prompts, - "prompt_template": prompt_template, + "test_stats": test_stats, } log_metadata(metadata) @@ -165,11 +197,104 @@ def gen_tests_agent( f"Test generation complete. 
Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out" ) - # Create a better prompt summary for the report - prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" + # Create test generation summary + test_summary = _create_test_summary( + provider, + model, + prompt_path, + files_to_process, + test_snippets, + test_stats, + total_tokens_in, + total_tokens_out, + ) # Return Path object - ZenML will automatically materialize the folder - return Path(tests_dir), prompt_summary + return Path(tests_dir), test_summary + + +def _create_test_summary( + provider: GenerationProvider, + model: str, + prompt_path: str, + files_processed: list, + test_snippets: Dict[str, str], + test_stats: Dict[str, Dict], + total_tokens_in: int, + total_tokens_out: int, +) -> MarkdownString: + """Create a markdown summary of test generation results.""" + + # Calculate totals + total_lines = sum( + stats.get("lines_generated", 0) for stats in test_stats.values() + ) + total_test_functions = sum( + stats.get("test_functions", 0) for stats in test_stats.values() + ) + + # Handle edge case of no files processed + if len(files_processed) == 0: + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: 0 + +## Generation Statistics +โš ๏ธ **No files were processed for test generation.** + +This could happen if: +- No files matched the target glob pattern +- All files were filtered out during analysis +- Max files limit was set to 0 + +**Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out +""" + return MarkdownString(summary) + + # Build markdown content for successful processing + avg_tests = total_test_functions / len(files_processed) + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: {len(files_processed)} + +## Generation Statistics +- **Total Lines Generated**: {total_lines:,} +- **Total Test Functions**: {total_test_functions} +- **Average Tests per File**: {avg_tests:.1f} +- **Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out + +## Generated Tests by File + +""" + + for file_path in files_processed: + stats = test_stats.get(file_path, {}) + snippet = test_snippets.get(file_path, "") + + complexity = stats.get("complexity_score", 0) + lines = stats.get("lines_generated", 0) + test_count = stats.get("test_functions", 0) + + summary += f"""### ๐Ÿ“„ `{file_path}` +**Complexity Score**: {complexity:.1f} | **Lines**: {lines} | **Test Functions**: {test_count} + +``` +{snippet} +``` + +--- + +""" + + return MarkdownString(summary) def _get_default_prompt_template() -> str: @@ -197,9 +322,6 @@ def _generate_fake_tests( file_path: str, source_code: str, max_tests: int ) -> Tuple[str, Dict]: """Generate fake/mock tests for development/testing.""" - # Create a simple module name from file path - module_name = file_path.replace("/", ".").replace(".py", "") - test_content = f'''""" Generated tests for {file_path} """ diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 3d7fcdc5..ab8e564d 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -22,7 +22,7 @@ def report( workspace_dir: Path, commit_sha: str, - prompt_used: str, + test_summary: MarkdownString, agent_results: Dict, baseline_results: 
Optional[Dict], ) -> Annotated[MarkdownString, "final_report"]: @@ -32,12 +32,12 @@ def report( Args: workspace_dir: Workspace directory path commit_sha: Git commit SHA - prompt_used: Prompt template used + test_summary: Test generation summary with snippets agent_results: Agent test results baseline_results: Baseline test results (optional) Returns: - Path to generated markdown report + Markdown report as string """ logger.info("Generating pipeline execution report") @@ -55,7 +55,7 @@ def report( report_content = _generate_report_content( workspace_dir, commit_sha, - prompt_used, + test_summary, agent_results, baseline_results, evaluation_metrics, @@ -131,7 +131,7 @@ def _evaluate_coverage_metrics( def _generate_report_content( workspace_dir: Path, commit_sha: str, - prompt_used: str, + test_summary: MarkdownString, agent_results: Dict, baseline_results: Optional[Dict], evaluation_metrics: Dict, @@ -236,15 +236,10 @@ def _generate_report_content( else: report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" - # Configuration section - report += """## Configuration + # Test generation details section + report += f"""## Test Generation Details -### Prompt Template -``` -""" - report += prompt_used[:500] + ("..." if len(prompt_used) > 500 else "") - report += """ -``` +{test_summary} ### File Coverage Details """ diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py index 4c8697f3..4ad3edc9 100644 --- a/qualityflow/steps/run_tests.py +++ b/qualityflow/steps/run_tests.py @@ -250,8 +250,6 @@ def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: # If still no coverage found, try branches-valid attribute (alternative format) if coverage_total == 0.0: - branches_valid = root.get("branches-valid", "0") - branches_covered = root.get("branches-covered", "0") lines_valid = root.get("lines-valid", "0") lines_covered = root.get("lines-covered", "0") From 5db0a78d8ce099c9c46308be880140673c1825bf Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:26:01 +0200 Subject: [PATCH 7/8] Update project Dockerfile link and add QualityFlow project --- ADDING_PROJECTS.md | 2 +- README.md | 1 + llm-complete-guide/README.md | 4 ++-- qualityflow/README.md | 3 --- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ADDING_PROJECTS.md b/ADDING_PROJECTS.md index a169560d..f1c81b9d 100644 --- a/ADDING_PROJECTS.md +++ b/ADDING_PROJECTS.md @@ -57,7 +57,7 @@ ENV ZENML_ENABLE_TUTORIAL=true ### When No Dockerfile is Needed If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at: -[https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile](https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile) +[https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA](https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA) ## ๐Ÿ”ง Backend Integration diff --git a/README.md b/README.md index 27b762ba..46accab2 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ etc. 
| [Nightwatch AI](nightwatch-ai) | ๐Ÿค– LLMOps | ๐Ÿ“ Summarization, ๐Ÿ“Š Reporting | openai, supabase, slack | | [ResearchRadar](research-radar) | ๐Ÿค– LLMOps | ๐Ÿ“ Classification, ๐Ÿ“Š Comparison | anthropic, huggingface, transformers | | [Deep Research](deep_research) | ๐Ÿค– LLMOps | ๐Ÿ“ Research, ๐Ÿ“Š Reporting, ๐Ÿ” Web Search | anthropic, mcp, agents, openai | +| [QualityFlow](qualityflow) | ๐Ÿค– LLMOps | ๐Ÿงช Test Generation, ๐Ÿ“Š Coverage Analysis, โšก Automation | openai, anthropic, pytest, jinja2 | | [End-to-end Computer Vision](end-to-end-computer-vision) | ๐Ÿ‘ CV | ๐Ÿ”Ž Object Detection, ๐Ÿท๏ธ Labeling | pytorch, label_studio, yolov8 | | [Magic Photobooth](magic-photobooth) | ๐Ÿ‘ CV | ๐Ÿ“ท Image Gen, ๐ŸŽž๏ธ Video Gen | stable-diffusion, huggingface | | [OmniReader](omni-reader) | ๐Ÿ‘ CV | ๐Ÿ“‘ OCR, ๐Ÿ“Š Evaluation, โš™๏ธ Batch Processing | polars, litellm, openai, ollama | diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index f352d2bf..7fd23bad 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -235,7 +235,7 @@ python run.py synthetic You will also need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) @@ -254,7 +254,7 @@ zenml secret update llm-complete -v '{"argilla_api_key": "YOUR_ARGILLA_API_KEY", As with the previous pipeline, you will need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) diff --git a/qualityflow/README.md b/qualityflow/README.md index 8b183404..dd69e58e 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -289,7 +289,6 @@ Set up automated regression testing using ZenML's scheduling capabilities or you QualityFlow follows ZenML best practices and is designed to be extended: 1. **Add New LLM Providers**: Extend `gen_tests_agent.py` with new provider integrations -2. **Custom Materializers**: Create materializers for new artifact types 3. **Additional Metrics**: Expand evaluation capabilities with new quality metrics 4. 
**Selection Strategies**: Add new code selection algorithms @@ -327,8 +326,6 @@ python run.py --config configs/experiment.default.yaml ## ๐Ÿ“š Resources - [ZenML Documentation](https://docs.zenml.io/) -- [Model Control Plane](https://docs.zenml.io/user-guide/model-control-plane) -- [Kubernetes Orchestrator](https://docs.zenml.io/stacks/stack-components/orchestrators/kubernetes) --- From 80fbe1273b869497102b1ea4424fe2eb6f21fcfb Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:28:42 +0200 Subject: [PATCH 8/8] Update link in ADDING_PROJECTS.md to zenml-projects-backend --- ADDING_PROJECTS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ADDING_PROJECTS.md b/ADDING_PROJECTS.md index f1c81b9d..35f26d3b 100644 --- a/ADDING_PROJECTS.md +++ b/ADDING_PROJECTS.md @@ -56,8 +56,7 @@ ENV ZENML_ENABLE_TUTORIAL=true ### When No Dockerfile is Needed -If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at: -[https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA](https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA) +If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at the zenml-projects-backend repo. ## ๐Ÿ”ง Backend Integration