From e0fac67e2c3e46d0c39203f3912887af2597d3bc Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:29:49 +0200 Subject: [PATCH 1/8] Added new projecg --- qualityflow/README.md | 358 ++++++++++++++++++ qualityflow/configs/experiment.default.yaml | 41 ++ qualityflow/configs/experiment.strict.yaml | 42 ++ qualityflow/examples/toy_lib/__init__.py | 5 + qualityflow/examples/toy_lib/calculator.py | 75 ++++ qualityflow/examples/toy_lib/string_utils.py | 120 ++++++ qualityflow/pipelines/__init__.py | 5 + .../pipelines/generate_and_evaluate.py | 63 +++ qualityflow/prompts/unit_test_strict_v2.jinja | 99 +++++ qualityflow/prompts/unit_test_v1.jinja | 61 +++ qualityflow/requirements.txt | 22 ++ qualityflow/run.py | 55 +++ qualityflow/steps/__init__.py | 21 + qualityflow/steps/analyze_code.py | 151 ++++++++ qualityflow/steps/evaluate_coverage.py | 68 ++++ qualityflow/steps/fetch_source.py | 70 ++++ qualityflow/steps/gen_tests_agent.py | 358 ++++++++++++++++++ qualityflow/steps/gen_tests_baseline.py | 190 ++++++++++ qualityflow/steps/report.py | 238 ++++++++++++ qualityflow/steps/run_tests.py | 258 +++++++++++++ qualityflow/steps/select_input.py | 38 ++ 21 files changed, 2338 insertions(+) create mode 100644 qualityflow/README.md create mode 100644 qualityflow/configs/experiment.default.yaml create mode 100644 qualityflow/configs/experiment.strict.yaml create mode 100644 qualityflow/examples/toy_lib/__init__.py create mode 100644 qualityflow/examples/toy_lib/calculator.py create mode 100644 qualityflow/examples/toy_lib/string_utils.py create mode 100644 qualityflow/pipelines/__init__.py create mode 100644 qualityflow/pipelines/generate_and_evaluate.py create mode 100644 qualityflow/prompts/unit_test_strict_v2.jinja create mode 100644 qualityflow/prompts/unit_test_v1.jinja create mode 100644 qualityflow/requirements.txt create mode 100644 qualityflow/run.py create mode 100644 qualityflow/steps/__init__.py create mode 100644 qualityflow/steps/analyze_code.py create mode 100644 qualityflow/steps/evaluate_coverage.py create mode 100644 qualityflow/steps/fetch_source.py create mode 100644 qualityflow/steps/gen_tests_agent.py create mode 100644 qualityflow/steps/gen_tests_baseline.py create mode 100644 qualityflow/steps/report.py create mode 100644 qualityflow/steps/run_tests.py create mode 100644 qualityflow/steps/select_input.py diff --git a/qualityflow/README.md b/qualityflow/README.md new file mode 100644 index 00000000..490e9aa2 --- /dev/null +++ b/qualityflow/README.md @@ -0,0 +1,358 @@ +# ๐Ÿงช QualityFlow: AI-Powered Test Generation Pipeline + +A streamlined MLOps pipeline for **automated test generation** using ZenML and LLMs. Generate comprehensive unit tests for your codebase, compare different approaches, and get detailed coverage analysis. + +## ๐Ÿš€ Product Overview + +QualityFlow demonstrates how to build production-ready MLOps workflows for automated test generation using Large Language Models. Built with ZenML, it provides a simple yet powerful pipeline for generating and evaluating AI-generated tests. + +**Focus**: **LLM-Powered Test Generation** and **Coverage Analysis**. 
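+
+Under the hood, QualityFlow is a single ZenML pipeline. Here is a condensed view of `pipelines/generate_and_evaluate.py` from this repository (docstrings and logging setup omitted):
+
+```python
+from zenml import pipeline
+
+from steps.select_input import select_input
+from steps.fetch_source import fetch_source
+from steps.analyze_code import analyze_code
+from steps.gen_tests_agent import gen_tests_agent
+from steps.gen_tests_baseline import gen_tests_baseline
+from steps.run_tests import run_tests
+from steps.report import report
+
+
+@pipeline(name="generate_and_evaluate")
+def generate_and_evaluate() -> None:
+    spec = select_input()                                    # resolve source specification
+    workspace_dir, commit_sha = fetch_source(spec)           # clone repo, materialize workspace
+    code_summary = analyze_code(workspace_dir, commit_sha)   # select candidate files
+    agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary)
+    baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary)
+    agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent")
+    baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline")
+    report(workspace_dir, commit_sha, prompt_used, agent_results, baseline_results)
+```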
+ +### Key Features + +- **Real LLM Integration**: OpenAI and Anthropic providers for intelligent test generation +- **Smart File Selection**: Configurable strategies to focus on files that need testing +- **Baseline Comparison**: Compare LLM-generated tests vs heuristic baseline tests +- **Coverage Analysis**: Real coverage metrics with detailed reporting +- **Speed Controls**: `max_files` parameters to control pipeline execution time +- **Containerized Ready**: Uses ZenML Path artifacts for remote execution +- **Cost Tracking**: Token usage and cost estimation with metadata logging + +## ๐Ÿ’ก How It Works + +### โœˆ๏ธ Pipeline Architecture + +QualityFlow consists of a single, focused pipeline: + +#### Generate & Evaluate Pipeline + +The main pipeline handles the complete test generation workflow: + +1. **Source Selection** - Specify repository and target files +2. **Code Fetching** - Clone and materialize workspace +3. **Code Analysis** - Select files for testing (with max_files limit) +4. **LLM Test Generation** - Generate tests using OpenAI/Anthropic/fake providers +5. **Baseline Generation** - Create simple heuristic tests for comparison +6. **Test Execution** - Run both test suites with coverage analysis +7. **Report Generation** - Compare results and generate markdown reports + +### ๐Ÿ”ง Architecture + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Git Repo โ”‚ โ”‚ LLM Providers โ”‚ โ”‚ Test Reports โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ src/**/*.py โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ OpenAI/Claude โ”‚โ”€โ”€โ”€โ”€โ”‚โ–ถ Coverage โ”‚ +โ”‚ target files โ”‚ โ”‚ Fake (testing) โ”‚ โ”‚ Comparisons โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ Cost Tracking โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ โ–ฒ + โ–ผ โ”‚ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ QualityFlow Pipeline โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ Generate & Evaluate โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ 1. Select Input โ†’ 2. Fetch Source โ†’ 3. Analyze โ”‚ โ”‚ +โ”‚ โ”‚ 4. Generate (LLM) โ†’ 5. Generate (Base) โ†’ 6. Run Tests โ”‚ โ”‚ +โ”‚ โ”‚ 7. Run Tests โ†’ 8. Report & Compare โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Features: max_files control, Path artifacts, metadata โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +## ๐Ÿ“ฆ Quick Start + +### Prerequisites + +- Python 3.9+ +- ZenML installed (`pip install zenml`) +- Git +- OpenAI API key (optional, can use fake provider) + +### Setup + +```bash +pip install -r requirements.txt +``` + +2. **Set up OpenAI (optional)**: +```bash +export OPENAI_API_KEY="your-api-key-here" +``` + +3. **Run the pipeline**: +```bash +python run.py +``` + +That's it! 
The pipeline will: +- Clone the configured repository (default: requests library) +- Analyze Python files and select candidates +- Generate tests using OpenAI (or fake provider if no API key) +- Run tests and measure coverage +- Generate a comprehensive report comparing approaches + +## โš™๏ธ Configuration + +### Key Parameters + +You can customize the pipeline behavior by editing `configs/experiment.default.yaml`: + +```yaml +# Control execution speed +steps: + analyze_code: + parameters: + max_files: 3 # Limit files to analyze (faster execution) + + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + max_files: 2 # Limit files for test generation + max_tests_per_file: 3 + + gen_tests_baseline: + parameters: + max_files: 2 # Match agent for fair comparison +``` + +### Pipeline Options + +```bash +# Use fake provider (no API key needed) +python run.py # Uses config defaults + +# Force fresh execution (no caching) +python run.py --no-cache + +# Use different config +python run.py --config configs/experiment.strict.yaml +``` + +## ๐Ÿ”ฌ Advanced Usage + +### Different Target Repositories + +Edit the config to point to your own repository: + +```yaml +steps: + select_input: + parameters: + repo_url: "https://github.com/your-org/your-repo.git" + ref: "main" + target_glob: "src/**/*.py" # Adjust path pattern +``` + +### Custom Prompts + +Create new Jinja2 templates in `prompts/`: + +```jinja2 +# prompts/custom_test_v3.jinja + +Generate {{ max_tests }} tests for: +{{ file_path }} (complexity: {{ complexity_score }}) + +Source: +```python +{{ source_code }} +``` + +Requirements: +- Use pytest fixtures +- Include edge cases +- Mock external dependencies +``` + +### A/B Testing Experiments + +Use run templates for systematic comparisons: + +```bash +# Compare prompt versions +python scripts/run_experiment.py --config configs/experiment.default.yaml +python scripts/run_experiment.py --config configs/experiment.strict.yaml + +# Compare in ZenML dashboard: +# - Coverage metrics +# - Test quality scores +# - Token usage and cost +# - Promotion decisions +``` + +### Production Deployment + +Set up ZenML stack for cloud deployment: + +```bash +# Example: AWS EKS stack +zenml artifact-store register s3_store --flavor=s3 --path=s3://your-bucket +zenml container-registry register ecr_registry --flavor=aws --uri=your-account.dkr.ecr.region.amazonaws.com +zenml orchestrator register k8s_orchestrator --flavor=kubernetes --kubernetes_context=your-eks-context + +zenml stack register production_stack \ + -a s3_store -c ecr_registry -o k8s_orchestrator --set +``` + +### Scheduled Regression + +Register batch regression for daily execution: + +```bash +python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule +``` + +## ๐Ÿ—๏ธ Project Structure + +``` +qualityflow/ +โ”œโ”€โ”€ README.md +โ”œโ”€โ”€ pyproject.toml +โ”œโ”€โ”€ requirements.txt +โ”œโ”€โ”€ .env.example +โ”œโ”€โ”€ zenml.yaml +โ”‚ +โ”œโ”€โ”€ configs/ # Pipeline configurations +โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings +โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ””โ”€โ”€ schedule.batch.yaml # Batch regression schedule +โ”‚ +โ”œโ”€โ”€ domain/ # Core data models +โ”‚ โ”œโ”€โ”€ schema.py # Pydantic models +โ”‚ โ””โ”€โ”€ stages.py # Deployment stages +โ”‚ +โ”œโ”€โ”€ pipelines/ # Pipeline definitions +โ”‚ โ”œโ”€โ”€ generate_and_evaluate.py # Experiment pipeline +โ”‚ โ””โ”€โ”€ batch_regression.py # Scheduled regression +โ”‚ +โ”œโ”€โ”€ steps/ # 
Pipeline steps +โ”‚ โ”œโ”€โ”€ select_input.py # Source specification +โ”‚ โ”œโ”€โ”€ fetch_source.py # Repository fetching +โ”‚ โ”œโ”€โ”€ analyze_code.py # Code analysis & selection +โ”‚ โ”œโ”€โ”€ gen_tests_agent.py # LLM test generation +โ”‚ โ”œโ”€โ”€ gen_tests_baseline.py # Heuristic test generation +โ”‚ โ”œโ”€โ”€ run_tests.py # Test execution & coverage +โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics & gate evaluation +โ”‚ โ”œโ”€โ”€ compare_and_promote.py # Model registry promotion +โ”‚ โ”œโ”€โ”€ resolve_test_pack.py # Test pack resolution +โ”‚ โ””โ”€โ”€ report.py # Report generation +โ”‚ +โ”œโ”€โ”€ prompts/ # Jinja2 prompt templates +โ”‚ โ”œโ”€โ”€ unit_test_v1.jinja # Standard test generation +โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation +โ”‚ +โ”œโ”€โ”€ materializers/ # Custom artifact handling +โ”œโ”€โ”€ utils/ # Utility functions +โ”‚ +โ”œโ”€โ”€ registry/ # Test Pack registry docs +โ”‚ โ””โ”€โ”€ README.md +โ”‚ +โ”œโ”€โ”€ run_templates/ # Experiment templates +โ”‚ โ”œโ”€โ”€ ab_agent_vs_strict.json # A/B testing configuration +โ”‚ โ””โ”€โ”€ baseline_only.json # Baseline establishment +โ”‚ +โ”œโ”€โ”€ scripts/ # CLI scripts +โ”‚ โ”œโ”€โ”€ run_experiment.py # Experiment runner +โ”‚ โ””โ”€โ”€ run_batch.py # Batch regression runner +โ”‚ +โ””โ”€โ”€ examples/ # Demo code for testing + โ””โ”€โ”€ toy_lib/ # Sample library + โ”œโ”€โ”€ calculator.py + โ””โ”€โ”€ string_utils.py +``` + +### Key Components + +- **Domain Models**: Pydantic schemas for type safety and validation +- **Pipeline Steps**: Modular, reusable components with clear interfaces +- **Prompt Templates**: Jinja2 templates for LLM test generation +- **Configuration**: YAML-driven experiment and deployment settings +- **Quality Gates**: Configurable thresholds for coverage and promotion +- **Model Registry**: ZenML Model Registry integration for test pack versioning + +## ๐Ÿš€ Production Deployment + +### ZenML Cloud Stack Setup + +For production deployment with ZenML Cloud: + +```bash +# Connect to ZenML Cloud +zenml connect --url https://your-org.zenml.cloud + +# Register cloud stack components +zenml artifact-store register cloud_store --flavor=s3 --path=s3://qualityflow-artifacts +zenml orchestrator register cloud_k8s --flavor=kubernetes --kubernetes_context=prod-cluster + +zenml stack register production \ + -a cloud_store -o cloud_k8s --set +``` + +### Scheduled Execution + +Set up automated regression testing: + +```bash +# Register schedule (example with ZenML Cloud) +python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule + +# Monitor via dashboard: +# - Daily regression results +# - Coverage trend analysis +# - Test pack performance +``` + +## ๐Ÿค Contributing + +QualityFlow follows ZenML best practices and is designed to be extended: + +1. **Add New LLM Providers**: Extend `gen_tests_agent.py` with new provider integrations +2. **Custom Materializers**: Create materializers for new artifact types +3. **Additional Metrics**: Expand evaluation capabilities with new quality metrics +4. **Selection Strategies**: Add new code selection algorithms + +## ๐Ÿ“ Next Steps + +After running QualityFlow successfully: + +1. **Explore ZenML Dashboard**: View pipeline runs, artifacts, and model registry +2. **Experiment with Prompts**: Try different test generation strategies +3. **Add Real Codebases**: Replace toy examples with your production code +4. **Deploy to Production**: Use cloud orchestration for scale +5. 
**Set Up Monitoring**: Configure alerts for regression detection + +## ๐Ÿ†˜ Troubleshooting + +### Common Issues + +**LLM API Errors**: +- Set `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` environment variables +- Use `provider: "fake"` for development without API keys + +**Test Execution Failures**: +- Ensure pytest and coverage tools are installed +- Check that workspace has proper Python path setup + +### Debug Mode + +Run with debug logging: + +```bash +export ZENML_LOGGING_VERBOSITY=DEBUG +python scripts/run_experiment.py --config configs/experiment.default.yaml +``` + +## ๐Ÿ“š Resources + +- [ZenML Documentation](https://docs.zenml.io/) +- [Model Control Plane](https://docs.zenml.io/user-guide/model-control-plane) +- [Kubernetes Orchestrator](https://docs.zenml.io/stacks/stack-components/orchestrators/kubernetes) + +--- + +Built with โค๏ธ using [ZenML](https://zenml.io) - *The MLOps Framework for Production AI* \ No newline at end of file diff --git a/qualityflow/configs/experiment.default.yaml b/qualityflow/configs/experiment.default.yaml new file mode 100644 index 00000000..61537368 --- /dev/null +++ b/qualityflow/configs/experiment.default.yaml @@ -0,0 +1,41 @@ +# QualityFlow Default Experiment Configuration +# Production-ready template for automated test generation & validation + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" # low_coverage | changed_files | all + max_files: 3 # Reduced for faster testing + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 2 # Limit files for faster testing + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 2 # Match agent max_files for consistency + + # No more evaluation gates or promotion - just simple coverage comparison + +# Resource configuration +settings: + docker: + requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/configs/experiment.strict.yaml b/qualityflow/configs/experiment.strict.yaml new file mode 100644 index 00000000..8d1d15ba --- /dev/null +++ b/qualityflow/configs/experiment.strict.yaml @@ -0,0 +1,42 @@ +# QualityFlow Strict Experiment Configuration +# Higher quality gates and strict prompt for comprehensive testing + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration +steps: + select_input: + parameters: + repo_url: "https://github.com/psf/requests.git" + ref: "main" + target_glob: "src/**/*.py,tests/**/*.py" + + analyze_code: + parameters: + strategy: "low_coverage" + max_files: 5 # Fewer files for more thorough testing + + # LLM generation with strict prompt + gen_tests_agent: + parameters: + provider: "openai" # openai | anthropic | fake + model: "gpt-4o" # More powerful model + prompt_path: "prompts/unit_test_strict_v2.jinja" + max_tests_per_file: 5 # More tests per file + max_files: 5 # Match analyze_code for consistency + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent for fair comparison + +# Resource configuration with higher limits +settings: + docker: + requirements: requirements.txt + resource_settings: + memory: "4Gi" + cpu_count: 
2.0 \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/__init__.py b/qualityflow/examples/toy_lib/__init__.py new file mode 100644 index 00000000..c70599d5 --- /dev/null +++ b/qualityflow/examples/toy_lib/__init__.py @@ -0,0 +1,5 @@ +""" +QualityFlow toy library example for testing. +""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/calculator.py b/qualityflow/examples/toy_lib/calculator.py new file mode 100644 index 00000000..c9ec644d --- /dev/null +++ b/qualityflow/examples/toy_lib/calculator.py @@ -0,0 +1,75 @@ +"""Simple calculator module for QualityFlow demonstration.""" + +from typing import Union + + +class Calculator: + """A simple calculator with basic arithmetic operations.""" + + def __init__(self): + """Initialize calculator with empty history.""" + self.history = [] + + def add(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Add two numbers.""" + result = a + b + self.history.append(f"{a} + {b} = {result}") + return result + + def subtract(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Subtract second number from first.""" + result = a - b + self.history.append(f"{a} - {b} = {result}") + return result + + def multiply(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Multiply two numbers.""" + result = a * b + self.history.append(f"{a} * {b} = {result}") + return result + + def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + """Divide first number by second.""" + if b == 0: + raise ValueError("Cannot divide by zero") + result = a / b + self.history.append(f"{a} / {b} = {result}") + return result + + def power(self, base: Union[int, float], exponent: Union[int, float]) -> Union[int, float]: + """Raise base to the power of exponent.""" + result = base ** exponent + self.history.append(f"{base} ** {exponent} = {result}") + return result + + def clear_history(self) -> None: + """Clear calculation history.""" + self.history.clear() + + def get_history(self) -> list[str]: + """Get calculation history.""" + return self.history.copy() + + +def factorial(n: int) -> int: + """Calculate factorial of n.""" + if n < 0: + raise ValueError("Factorial is not defined for negative numbers") + if n == 0 or n == 1: + return 1 + return n * factorial(n - 1) + + +def is_prime(n: int) -> bool: + """Check if a number is prime.""" + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + + for i in range(3, int(n**0.5) + 1, 2): + if n % i == 0: + return False + return True \ No newline at end of file diff --git a/qualityflow/examples/toy_lib/string_utils.py b/qualityflow/examples/toy_lib/string_utils.py new file mode 100644 index 00000000..d842b500 --- /dev/null +++ b/qualityflow/examples/toy_lib/string_utils.py @@ -0,0 +1,120 @@ +""" +String utility functions for QualityFlow demonstration. 
+""" + +import re +from typing import List, Optional + + +def reverse_string(s: str) -> str: + """Reverse a string.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + return s[::-1] + + +def is_palindrome(s: str, ignore_case: bool = True) -> bool: + """Check if a string is a palindrome.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + + # Clean the string - keep only alphanumeric characters + cleaned = re.sub(r'[^a-zA-Z0-9]', '', s) + + if ignore_case: + cleaned = cleaned.lower() + + return cleaned == cleaned[::-1] + + +def count_words(text: str) -> int: + """Count words in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + if not text.strip(): + return 0 + + words = text.split() + return len(words) + + +def capitalize_words(text: str) -> str: + """Capitalize the first letter of each word.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + return ' '.join(word.capitalize() for word in text.split()) + + +def extract_emails(text: str) -> List[str]: + """Extract email addresses from text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + return re.findall(email_pattern, text) + + +def truncate_string(s: str, max_length: int, suffix: str = "...") -> str: + """Truncate string to maximum length with suffix.""" + if not isinstance(s, str): + raise TypeError("Input must be a string") + if not isinstance(max_length, int) or max_length < 0: + raise ValueError("max_length must be a non-negative integer") + + if len(s) <= max_length: + return s + + if max_length <= len(suffix): + return s[:max_length] + + return s[:max_length - len(suffix)] + suffix + + +class TextProcessor: + """Text processing utility class.""" + + def __init__(self, default_encoding: str = "utf-8"): + self.default_encoding = default_encoding + self.processed_count = 0 + + def clean_text(self, text: str, remove_punctuation: bool = False) -> str: + """Clean text by removing extra whitespace and optionally punctuation.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + # Remove extra whitespace + cleaned = ' '.join(text.split()) + + if remove_punctuation: + # Remove punctuation except spaces + cleaned = re.sub(r'[^\w\s]', '', cleaned) + + self.processed_count += 1 + return cleaned + + def word_frequency(self, text: str, ignore_case: bool = True) -> dict[str, int]: + """Count word frequency in text.""" + if not isinstance(text, str): + raise TypeError("Input must be a string") + + words = text.split() + if ignore_case: + words = [word.lower() for word in words] + + frequency = {} + for word in words: + # Remove punctuation from word + clean_word = re.sub(r'[^\w]', '', word) + if clean_word: + frequency[clean_word] = frequency.get(clean_word, 0) + 1 + + return frequency + + def get_stats(self) -> dict[str, int]: + """Get processing statistics.""" + return { + "processed_count": self.processed_count, + "default_encoding": self.default_encoding + } \ No newline at end of file diff --git a/qualityflow/pipelines/__init__.py b/qualityflow/pipelines/__init__.py new file mode 100644 index 00000000..525d0f58 --- /dev/null +++ b/qualityflow/pipelines/__init__.py @@ -0,0 +1,5 @@ +"""QualityFlow pipelines.""" + +from .generate_and_evaluate import generate_and_evaluate + +__all__ = ["generate_and_evaluate"] \ No newline at end of file diff --git 
a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py new file mode 100644 index 00000000..c50754e3 --- /dev/null +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -0,0 +1,63 @@ +""" +QualityFlow experiment pipeline for test generation and evaluation. +""" + +from typing import Annotated + +from zenml import pipeline +from zenml.logger import get_logger + +from steps.select_input import select_input +from steps.fetch_source import fetch_source +from steps.analyze_code import analyze_code +from steps.gen_tests_agent import gen_tests_agent +from steps.gen_tests_baseline import gen_tests_baseline +from steps.run_tests import run_tests +from steps.report import report + +logger = get_logger(__name__) + + +@pipeline(name="generate_and_evaluate") +def generate_and_evaluate() -> None: + """QualityFlow pipeline for generating and evaluating tests. + + Simple, focused pipeline: + 1. Analyze code to find files needing tests + 2. Generate tests using LLM and baseline approaches + 3. Run tests and measure coverage + 4. Report results for comparison + """ + # Step 1: Resolve source specification + spec = select_input() + + # Step 2: Fetch and materialize workspace + workspace_dir, commit_sha = fetch_source(spec) + + # Step 3: Analyze and select code files + code_summary = analyze_code( + workspace_dir, commit_sha + ) + + # Step 4: Generate tests using LLM agent + agent_tests_dir, prompt_used = gen_tests_agent( + workspace_dir, code_summary + ) + + # Step 5: Generate baseline tests (optional) + baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) + + # Step 6: Run agent tests + agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent") + + # Step 7: Run baseline tests (if available) + baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline") + + # Step 8: Generate comprehensive report (includes evaluation) + report( + workspace_dir, + commit_sha, + prompt_used, + agent_results, + baseline_results, + ) \ No newline at end of file diff --git a/qualityflow/prompts/unit_test_strict_v2.jinja b/qualityflow/prompts/unit_test_strict_v2.jinja new file mode 100644 index 00000000..32dd2643 --- /dev/null +++ b/qualityflow/prompts/unit_test_strict_v2.jinja @@ -0,0 +1,99 @@ +# Unit Test Generation Prompt v2.0 (Strict) +# Comprehensive test generation with advanced patterns + +You are a senior Python test engineer with expertise in test-driven development. Generate production-grade unit tests with comprehensive coverage. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Advanced Testing Requirements + +Generate {{ max_tests }} comprehensive tests covering ALL of the following: + +### 1. Functional Coverage +- **Happy paths**: Normal operation scenarios +- **Edge cases**: Boundary values, empty collections, extreme inputs +- **Error handling**: Exception paths, invalid states +- **State transitions**: Object lifecycle, state changes + +### 2. Quality Patterns +- **Arrange-Act-Assert** structure +- **Given-When-Then** scenarios +- **Property-based testing** where applicable +- **Parameterized tests** for multiple scenarios + +### 3. 
Advanced Techniques +- **Mock interactions**: Verify call patterns, not just return values +- **Context managers**: Test resource cleanup +- **Async/await**: If code contains async patterns +- **Thread safety**: If code has concurrency +- **Performance bounds**: Basic timing assertions + +### 4. Security Considerations +- **Input sanitization**: SQL injection, XSS prevention +- **Authorization**: Access control validation +- **Data exposure**: Sensitive information leakage + +## Technical Requirements + +- Use `pytest` with fixtures and parametrization +- Implement proper test isolation +- Include integration test patterns where relevant +- Use `hypothesis` for property-based tests when beneficial +- Mock all external dependencies (filesystem, network, databases) +- Test both success and failure scenarios thoroughly + +## Output Format + +Provide production-ready test code: + +```python +""" +Comprehensive unit tests for {{ file_path }} +Generated by QualityFlow (Strict Mode) +Coverage target: >95% line and branch coverage +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock, call +from hypothesis import given, strategies as st +import tempfile +import os +from contextlib import contextmanager + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class Test{{ file_path.split('/')[-1].replace('.py', '').title() }}(unittest.TestCase): + """Comprehensive test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures and mock objects.""" + pass + + def tearDown(self): + """Clean up after tests.""" + pass + + # Generated test methods with comprehensive coverage + + @pytest.mark.parametrize("input,expected", [ + # Add parameterized test cases + ]) + def test_parametrized_scenarios(self, input, expected): + """Test multiple scenarios with parameterization.""" + pass + +if __name__ == "__main__": + unittest.main() +``` + +Focus on realistic, maintainable tests that would pass code review in a production environment. \ No newline at end of file diff --git a/qualityflow/prompts/unit_test_v1.jinja b/qualityflow/prompts/unit_test_v1.jinja new file mode 100644 index 00000000..1c1cd444 --- /dev/null +++ b/qualityflow/prompts/unit_test_v1.jinja @@ -0,0 +1,61 @@ +# Unit Test Generation Prompt v1.0 +# Standard test generation for QualityFlow + +You are an expert Python test engineer. Generate comprehensive unit tests for the following code. + +## Code Analysis +- **File**: `{{ file_path }}` +- **Complexity Score**: {{ complexity_score }} +- **Target Test Count**: {{ max_tests }} + +## Source Code +```python +{{ source_code }} +``` + +## Instructions + +Generate {{ max_tests }} high-quality unit tests that cover: +1. **Happy path scenarios** - typical usage patterns +2. **Edge cases** - boundary conditions, empty inputs, None values +3. **Error conditions** - invalid inputs, exceptions +4. 
**Integration points** - mocked dependencies where applicable + +## Requirements + +- Use `pytest` and `unittest.TestCase` patterns +- Include proper docstrings for test methods +- Use `unittest.mock` for external dependencies +- Focus on behavioral testing, not implementation details +- Ensure tests are deterministic and repeatable +- Include setup/teardown if needed + +## Output Format + +Provide only the Python test code with no additional explanation: + +```python +""" +Unit tests for {{ file_path }} +Generated by QualityFlow +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +# Import the module under test +# from {{ file_path.replace('/', '.').replace('.py', '') }} import * + +class TestModule(unittest.TestCase): + """Test suite for {{ file_path }}.""" + + def setUp(self): + """Set up test fixtures.""" + pass + + # Your generated test methods here + +if __name__ == "__main__": + unittest.main() +``` \ No newline at end of file diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt new file mode 100644 index 00000000..72c59212 --- /dev/null +++ b/qualityflow/requirements.txt @@ -0,0 +1,22 @@ +# ZenML and Core MLOps +zenml>=0.84.2 + +# Core Python Libraries +pydantic>=2.0.0,<3.0.0 +pyyaml>=6.0,<7.0 +jinja2>=3.0.0,<4.0.0 + +# Testing Framework +pytest>=7.0.0,<8.0.0 +pytest-cov>=4.0.0,<5.0.0 +coverage>=7.0.0,<8.0.0 + +# Code Analysis +ast>=3.9 + +# Git Integration +gitpython>=3.1.0,<4.0.0 + +# LLM Integration (optional) +openai>=1.0.0,<2.0.0 # for OpenAI provider +anthropic>=0.25.0,<1.0.0 # for Anthropic provider \ No newline at end of file diff --git a/qualityflow/run.py b/qualityflow/run.py new file mode 100644 index 00000000..11751350 --- /dev/null +++ b/qualityflow/run.py @@ -0,0 +1,55 @@ +""" +Entry point for running QualityFlow test generation pipeline. +""" + +from pathlib import Path + +import click +from pipelines import generate_and_evaluate +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@click.command() +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False), + default=None, + required=False, + help="Path to configuration YAML file. Defaults to configs/experiment.default.yaml", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable pipeline caching and force fresh execution", +) +def main(config: str | None, no_cache: bool): + """Run QualityFlow test generation and coverage analysis pipeline. + + Simple pipeline that generates tests using LLM, runs them, measures coverage, + and compares results against baseline approaches. 
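+
+    Typical invocations (as shown in the README Quick Start):
+
+        python run.py
+        python run.py --no-cache
+        python run.py --config configs/experiment.strict.yaml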
+ """ + + project_root = Path(__file__).parent + default_config = project_root / "configs" / "experiment.default.yaml" + chosen_config = config or str(default_config) + + try: + logger.info(f"Starting QualityFlow pipeline with config: {chosen_config}") + pipeline_instance = generate_and_evaluate.with_options( + config_path=chosen_config, + enable_cache=not no_cache + ) + pipeline_instance() + logger.info("QualityFlow pipeline completed successfully!") + + except Exception as e: + logger.error(f"Pipeline execution failed: {e}") + raise + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/qualityflow/steps/__init__.py b/qualityflow/steps/__init__.py new file mode 100644 index 00000000..70abca08 --- /dev/null +++ b/qualityflow/steps/__init__.py @@ -0,0 +1,21 @@ +"""QualityFlow pipeline steps.""" + +from .select_input import select_input +from .fetch_source import fetch_source +from .analyze_code import analyze_code +from .gen_tests_agent import gen_tests_agent +from .gen_tests_baseline import gen_tests_baseline +from .run_tests import run_tests +from .evaluate_coverage import evaluate_coverage +from .report import report + +__all__ = [ + "select_input", + "fetch_source", + "analyze_code", + "gen_tests_agent", + "gen_tests_baseline", + "run_tests", + "evaluate_coverage", + "report", +] \ No newline at end of file diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py new file mode 100644 index 00000000..0bfebd9c --- /dev/null +++ b/qualityflow/steps/analyze_code.py @@ -0,0 +1,151 @@ +""" +Analyze and select code files for test generation. +""" + +import glob +import ast +import os +from pathlib import Path +from typing import Annotated, Dict, List, Tuple + +from zenml import step +from zenml.logger import get_logger +from enum import Enum + + +class SelectionStrategy(str, Enum): + """Code file selection strategies.""" + LOW_COVERAGE = "low_coverage" + CHANGED_FILES = "changed_files" + ALL = "all" + +logger = get_logger(__name__) + + +@step +def analyze_code( + workspace_dir: Path, + commit_sha: str, + target_glob: str = "src/**/*.py", + strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, + max_files: int = 10, +) -> Annotated[Dict, "code_summary"]: + """ + Analyze workspace and select candidate files for test generation. 
+ + Args: + workspace_dir: Path to workspace directory + commit_sha: Git commit SHA + target_glob: Glob pattern for target files + strategy: File selection strategy + max_files: Maximum number of files to select + + Returns: + Code summary dictionary containing selected files and metadata + """ + logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") + + workspace_path = Path(workspace_dir) + + # Find all Python files matching glob pattern + all_files = [] + for pattern in target_glob.split(","): + pattern = pattern.strip() + matched_files = glob.glob(str(workspace_path / pattern), recursive=True) + all_files.extend(matched_files) + + # Make paths relative to workspace + relative_files = [ + os.path.relpath(f, workspace_dir) + for f in all_files + if f.endswith('.py') and os.path.isfile(f) + ] + + logger.info(f"Found {len(relative_files)} Python files") + + # Calculate complexity scores + complexity_scores = {} + valid_files = [] + + for file_path in relative_files: + full_path = workspace_path / file_path + try: + with open(full_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Parse AST and calculate basic complexity + tree = ast.parse(content) + complexity = _calculate_complexity(tree) + complexity_scores[file_path] = complexity + valid_files.append(file_path) + + except (SyntaxError, UnicodeDecodeError) as e: + logger.warning(f"Skipping {file_path} due to parsing error: {e}") + continue + + # Select files based on strategy + selected_files = _select_files(valid_files, complexity_scores, strategy, max_files) + + code_summary = { + "selected_files": selected_files, + "total_files": len(valid_files), + "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", + "complexity_scores": {f: complexity_scores[f] for f in selected_files} + } + + logger.info(f"Selected {len(selected_files)} files: {selected_files}") + + return code_summary + + +def _calculate_complexity(tree: ast.AST) -> float: + """Calculate basic complexity score for an AST.""" + class ComplexityVisitor(ast.NodeVisitor): + def __init__(self): + self.complexity = 0 + self.functions = 0 + self.classes = 0 + + def visit_FunctionDef(self, node): + self.functions += 1 + self.complexity += 1 + for child in ast.walk(node): + if isinstance(child, (ast.If, ast.For, ast.While, ast.Try)): + self.complexity += 1 + self.generic_visit(node) + + def visit_ClassDef(self, node): + self.classes += 1 + self.complexity += 1 + self.generic_visit(node) + + visitor = ComplexityVisitor() + visitor.visit(tree) + + # Combine metrics into single score + return visitor.complexity + visitor.functions * 0.5 + visitor.classes * 2 + + +def _select_files( + files: List[str], + complexity_scores: Dict[str, float], + strategy: SelectionStrategy, + max_files: int +) -> List[str]: + """Select files based on strategy.""" + + if strategy == SelectionStrategy.ALL: + return files[:max_files] + + elif strategy == SelectionStrategy.LOW_COVERAGE: + # Prioritize complex files that likely need more tests + sorted_files = sorted(files, key=lambda f: complexity_scores[f], reverse=True) + return sorted_files[:max_files] + + elif strategy == SelectionStrategy.CHANGED_FILES: + # For this demo, just return all files (in real implementation, would use git diff) + logger.warning("CHANGED_FILES strategy not fully implemented, falling back to ALL") + return files[:max_files] + + else: + raise ValueError(f"Unknown selection strategy: {strategy}") \ No newline at end of file diff --git 
a/qualityflow/steps/evaluate_coverage.py b/qualityflow/steps/evaluate_coverage.py new file mode 100644 index 00000000..9d384b10 --- /dev/null +++ b/qualityflow/steps/evaluate_coverage.py @@ -0,0 +1,68 @@ +""" +Evaluate coverage metrics and compare against baselines. +""" + +from typing import Annotated, Dict, Optional +from zenml import step, Model +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def evaluate_coverage( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Annotated[Dict, "evaluation_metrics"]: + """ + Evaluate coverage metrics and compare agent vs baseline approaches. + + Args: + agent_results: Test results from agent-generated tests + baseline_results: Test results from baseline tests (optional) + commit_sha: Current commit SHA + + Returns: + Evaluation metrics dictionary with coverage comparison + """ + logger.info("Evaluating coverage metrics and computing deltas") + + # Extract agent metrics + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 + + # Extract baseline metrics + coverage_total_baseline = None + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = 0.0 + if coverage_total_baseline is not None: + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" + coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" + + evaluation_metrics = { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + logger.info(f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, improvement={coverage_improvement:+.2f}%") + + return evaluation_metrics \ No newline at end of file diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py new file mode 100644 index 00000000..c117f2d2 --- /dev/null +++ b/qualityflow/steps/fetch_source.py @@ -0,0 +1,70 @@ +""" +Fetch source code workspace step. +""" + +import tempfile +import subprocess +from pathlib import Path +from typing import Annotated, Dict, Tuple + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def fetch_source( + source_spec: Dict[str, str], +) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: + """ + Fetch and materialize workspace from git repository. 
+ + Args: + source_spec: Source specification from select_input step + + Returns: + Tuple of workspace directory path and commit SHA + """ + repo_url = source_spec["repo_url"] + ref = source_spec["ref"] + + logger.info(f"Fetching source from {repo_url}@{ref}") + + # Create temporary workspace + workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") + workspace_path = Path(workspace_dir) + + try: + # Clone repository + logger.info(f"Cloning {repo_url} to {workspace_dir}") + subprocess.run( + ["git", "clone", "--depth", "1", "--branch", ref, repo_url, workspace_dir], + check=True, + capture_output=True, + text=True, + ) + + # Get commit SHA + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=workspace_dir, + check=True, + capture_output=True, + text=True, + ) + commit_sha = result.stdout.strip() + + logger.info(f"Workspace ready at {workspace_dir}, commit: {commit_sha}") + + return Path(workspace_dir), commit_sha + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to fetch source: {e}") + raise RuntimeError(f"Git operation failed: {e.stderr}") + except Exception as e: + logger.error(f"Unexpected error fetching source: {e}") + # Clean up on error + import shutil + shutil.rmtree(workspace_dir, ignore_errors=True) + raise \ No newline at end of file diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py new file mode 100644 index 00000000..8ed37d31 --- /dev/null +++ b/qualityflow/steps/gen_tests_agent.py @@ -0,0 +1,358 @@ +""" +Generate tests using LLM agent. +""" + +import tempfile +from pathlib import Path +from typing import Annotated, Dict, List, Tuple +from jinja2 import Template + +from zenml import step +from zenml.logger import get_logger +from zenml import log_metadata +from enum import Enum + + +class GenerationProvider(str, Enum): + """LLM providers for test generation.""" + OPENAI = "openai" + ANTHROPIC = "anthropic" + FAKE = "fake" + +logger = get_logger(__name__) + + +@step +def gen_tests_agent( + workspace_dir: Path, + code_summary: Dict, + provider: GenerationProvider = GenerationProvider.FAKE, + model: str = "gpt-4o-mini", + prompt_path: str = "prompts/unit_test_v1.jinja", + max_tests_per_file: int = 3, + max_files: int = 10, +) -> Tuple[ + Annotated[Path, "agent_tests_dir"], + Annotated[str, "prompt_used"] +]: + """Generate tests using LLM agent. 
+ + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + provider: LLM provider to use + model: Model name + prompt_path: Path to Jinja2 prompt template + max_tests_per_file: Maximum tests to generate per file + max_files: Maximum number of files to process (for speed control) + + Returns: + Tuple of test directory and prompt used + """ + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = selected_files[:max_files] if max_files > 0 else selected_files + logger.info(f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}") + + # Create tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") + tests_path = Path(tests_dir) + + # Load prompt template + workspace_path = Path(workspace_dir) + prompt_file = workspace_path / prompt_path + + if prompt_file.exists(): + with open(prompt_file, 'r') as f: + prompt_template = f.read() + else: + # Use default template if file doesn't exist + prompt_template = _get_default_prompt_template() + logger.info(f"Using default prompt template, {prompt_path} not found") + + template = Template(prompt_template) + + total_tokens_in = 0 + total_tokens_out = 0 + materialized_prompts = {} # Store materialized prompts per file + + for file_path in files_to_process: + logger.info(f"Generating tests for {file_path}") + + # Read source file + full_file_path = workspace_path / file_path + with open(full_file_path, 'r') as f: + source_code = f.read() + + # Render prompt + materialized_prompt = template.render( + file_path=file_path, + source_code=source_code, + max_tests=max_tests_per_file, + complexity_score=code_summary.get("complexity_scores", {}).get(file_path, 0) + ) + + # Store the materialized prompt for this file + materialized_prompts[file_path] = materialized_prompt + + # Generate tests using provider + if provider == GenerationProvider.FAKE: + generated_tests, tokens = _generate_fake_tests(file_path, source_code, max_tests_per_file) + elif provider == GenerationProvider.OPENAI: + generated_tests, tokens = _generate_openai_tests(materialized_prompt, model) + elif provider == GenerationProvider.ANTHROPIC: + generated_tests, tokens = _generate_anthropic_tests(materialized_prompt, model) + else: + raise ValueError(f"Unsupported provider: {provider}") + + total_tokens_in += tokens.get("tokens_in", 0) + total_tokens_out += tokens.get("tokens_out", 0) + + # Save generated tests + test_file_name = f"test_{Path(file_path).stem}.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, 'w') as f: + f.write(generated_tests) + + logger.info(f"Generated tests saved to {test_file_path}") + + # Log comprehensive metadata including materialized prompts + metadata = { + "token_usage": { + "tokens_in": total_tokens_in, + "tokens_out": total_tokens_out, + "cost_estimate": _estimate_cost(total_tokens_in, total_tokens_out, provider, model), + }, + "config": { + "provider": provider.value, + "model": model, + "prompt_template_path": prompt_path, + "max_tests_per_file": max_tests_per_file, + "files_processed": len(files_to_process), + }, + "materialized_prompts": materialized_prompts, + "prompt_template": prompt_template, + } + + log_metadata(metadata) + logger.info(f"Test generation complete. 
Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out") + + # Create a better prompt summary for the report + prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir), prompt_summary + + +def _get_default_prompt_template() -> str: + """Default Jinja2 prompt template for test generation.""" + return """# Generate unit tests for the following Python code + +File: {{ file_path }} +Complexity Score: {{ complexity_score }} +Max Tests: {{ max_tests }} + +## Source Code: +```python +{{ source_code }} +``` + +## Instructions: +Generate {{ max_tests }} comprehensive unit tests for the functions and classes in this code. +Focus on edge cases, error conditions, and typical usage patterns. + +## Generated Tests: +""" + + +def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tuple[str, Dict]: + """Generate fake/mock tests for development/testing.""" + # Create a simple module name from file path + module_name = file_path.replace('/', '.').replace('.py', '') + + test_content = f'''""" +Generated tests for {file_path} +""" + +import pytest +import unittest +from unittest.mock import Mock, patch, MagicMock + +class Test{file_path.split('/')[-1].replace('.py', '').title()}(unittest.TestCase): + """Auto-generated test class for {file_path}.""" + + def test_module_import(self): + """Test that we can at least validate the test framework.""" + # Simple test that always passes to ensure test discovery works + self.assertTrue(True) + + def test_basic_functionality(self): + """Test basic functionality.""" + # Mock test demonstrating test execution + result = 1 + 1 + self.assertEqual(result, 2) + + def test_error_handling(self): + """Test error handling.""" + # Test exception handling + with self.assertRaises(ValueError): + raise ValueError("Expected test exception") + + def test_mock_usage(self): + """Test mock functionality.""" + # Test using mocks + mock_obj = Mock() + mock_obj.method.return_value = "mocked_result" + result = mock_obj.method() + self.assertEqual(result, "mocked_result") + + def test_coverage_target(self): + """Test that generates some coverage.""" + # Simple operations to generate coverage + data = {{"key": "value"}} + self.assertIn("key", data) + + items = [1, 2, 3, 4, 5] + filtered = [x for x in items if x > 3] + self.assertEqual(len(filtered), 2) + +if __name__ == "__main__": + unittest.main() +''' + + tokens = {"tokens_in": 100, "tokens_out": 50} + return test_content, tokens + + +def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using OpenAI API.""" + try: + import openai + import os + + # Get API key from environment + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + logger.warning("OPENAI_API_KEY not found, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + client = openai.OpenAI(api_key=api_key) + + # Call OpenAI API + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a Python test generation expert. 
Generate comprehensive unit tests for the given code."}, + {"role": "user", "content": prompt} + ], + max_tokens=2000, + temperature=0.1 + ) + + # Extract test code from response + generated_content = response.choices[0].message.content + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.prompt_tokens, + "tokens_out": response.usage.completion_tokens + } + + logger.info(f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + return test_content, tokens + + except ImportError: + logger.warning("OpenAI library not installed, using fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with OpenAI: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("openai_file", "mock_code", 3) + + +def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: + """Generate tests using Anthropic API.""" + try: + import anthropic + import os + + # Get API key from environment + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + logger.warning("ANTHROPIC_API_KEY not found, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + client = anthropic.Anthropic(api_key=api_key) + + # Call Anthropic API + response = client.messages.create( + model=model, + max_tokens=2000, + temperature=0.1, + messages=[ + {"role": "user", "content": f"You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.\n\n{prompt}"} + ] + ) + + # Extract test content from response + generated_content = response.content[0].text + + # Try to extract Python code blocks + if "```python" in generated_content: + start = generated_content.find("```python") + 9 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + elif "```" in generated_content: + start = generated_content.find("```") + 3 + end = generated_content.find("```", start) + test_content = generated_content[start:end].strip() + else: + # Use the whole response if no code blocks found + test_content = generated_content.strip() + + # Token usage for cost estimation + tokens = { + "tokens_in": response.usage.input_tokens, + "tokens_out": response.usage.output_tokens + } + + logger.info(f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + return test_content, tokens + + except ImportError: + logger.warning("Anthropic library not installed, using fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + except Exception as e: + logger.error(f"Failed to generate tests with Anthropic: {e}") + logger.warning("Falling back to fake tests") + return _generate_fake_tests("anthropic_file", "mock_code", 3) + + +def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str) -> float: + """Estimate cost based on token usage.""" + # Rough cost estimates (would need real pricing) + if provider == GenerationProvider.OPENAI: + if "gpt-4" in model: + return (tokens_in * 0.00003) + (tokens_out * 0.00006) + else: # gpt-3.5 + return (tokens_in * 0.0000015) + (tokens_out * 0.000002) + elif provider == GenerationProvider.ANTHROPIC: + return (tokens_in * 0.000008) + (tokens_out * 0.000024) + else: + return 0.0 \ No newline at end of file diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py new file mode 100644 index 00000000..68a0a4e4 --- /dev/null +++ b/qualityflow/steps/gen_tests_baseline.py @@ -0,0 +1,190 @@ +""" +Generate baseline/skeleton tests using heuristics. +""" + +import tempfile +import ast +from pathlib import Path +from typing import Annotated, Dict, List, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def gen_tests_baseline( + workspace_dir: Path, + code_summary: Dict, + enabled: bool = True, + max_files: int = 10, +) -> Annotated[Optional[Path], "baseline_tests_dir"]: + """ + Generate baseline/skeleton tests using heuristic analysis. 
+ + Args: + workspace_dir: Path to workspace directory + code_summary: Code analysis summary containing selected files + enabled: Whether baseline generation is enabled + max_files: Maximum number of files to process + + Returns: + Path to baseline tests directory, or None if disabled + """ + if not enabled: + logger.info("Baseline test generation disabled") + return None + + # Extract selected files from code summary + selected_files = code_summary.get("selected_files", []) + + # Limit files if max_files is specified + files_to_process = selected_files[:max_files] if max_files > 0 else selected_files + logger.info(f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files") + + # Create baseline tests directory + tests_dir = tempfile.mkdtemp(prefix="qualityflow_baseline_tests_") + tests_path = Path(tests_dir) + + workspace_path = Path(workspace_dir) + + for file_path in files_to_process: + logger.info(f"Generating baseline tests for {file_path}") + + # Read and parse source file + full_file_path = workspace_path / file_path + with open(full_file_path, 'r') as f: + source_code = f.read() + + try: + tree = ast.parse(source_code) + + # Extract functions and classes + functions, classes = _extract_testable_items(tree) + + # Generate skeleton tests + test_content = _generate_skeleton_tests(file_path, functions, classes) + + # Save baseline tests + test_file_name = f"test_{Path(file_path).stem}_baseline.py" + test_file_path = tests_path / test_file_name + + with open(test_file_path, 'w') as f: + f.write(test_content) + + logger.info(f"Baseline tests saved to {test_file_path}") + + except SyntaxError as e: + logger.warning(f"Skipping {file_path} due to syntax error: {e}") + continue + + logger.info("Baseline test generation complete") + + # Return Path object - ZenML will automatically materialize the folder + return Path(tests_dir) + + +def _extract_testable_items(tree: ast.AST) -> tuple[List[str], List[str]]: + """Extract function and class names from AST.""" + functions = [] + classes = [] + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + # Skip private functions (starting with _) + if not node.name.startswith('_'): + functions.append(node.name) + elif isinstance(node, ast.ClassDef): + # Skip private classes + if not node.name.startswith('_'): + classes.append(node.name) + + return functions, classes + + +def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List[str]) -> str: + """Generate skeleton test content.""" + + # Create imports section + imports = f'''""" +Baseline/skeleton tests for {file_path} +Generated using heuristic analysis. +""" + +import pytest +import unittest +from unittest.mock import Mock, patch +''' + + # Try to determine import path from file path + module_path = file_path.replace('/', '.').replace('.py', '') + if module_path.startswith('src.'): + module_path = module_path[4:] # Remove 'src.' 
prefix + + if functions or classes: + imports += f"# from {module_path} import {', '.join(functions + classes)}\n\n" + else: + imports += f"# from {module_path} import *\n\n" + + # Generate function tests + function_tests = "" + for func_name in functions: + function_tests += f''' +def test_{func_name}_basic(): + """Basic test for {func_name}.""" + # TODO: Implement test for {func_name} + pass + +def test_{func_name}_error_cases(): + """Error case test for {func_name}.""" + # TODO: Test error conditions for {func_name} + pass +''' + + # Generate class tests + class_tests = "" + for class_name in classes: + class_tests += f''' +class Test{class_name}(unittest.TestCase): + """Test suite for {class_name}.""" + + def setUp(self): + """Set up test fixtures.""" + # TODO: Initialize test fixtures + pass + + def test_{class_name.lower()}_init(self): + """Test {class_name} initialization.""" + # TODO: Test class initialization + pass + + def test_{class_name.lower()}_methods(self): + """Test {class_name} methods.""" + # TODO: Test class methods + pass +''' + + # Add default test if no functions or classes found + if not functions and not classes: + default_test = ''' +class TestModule(unittest.TestCase): + """Default test suite for module.""" + + def test_module_imports(self): + """Test that module can be imported.""" + # TODO: Add import test + pass +''' + class_tests += default_test + + # Combine all parts + test_content = imports + function_tests + class_tests + + # Add main block + test_content += ''' +if __name__ == "__main__": + unittest.main() +''' + + return test_content \ No newline at end of file diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py new file mode 100644 index 00000000..141ecda2 --- /dev/null +++ b/qualityflow/steps/report.py @@ -0,0 +1,238 @@ +""" +Generate comprehensive pipeline report. +""" + +import tempfile +from pathlib import Path +from typing import Annotated, Dict, Optional +from datetime import datetime + +from zenml import step +from zenml.logger import get_logger +from zenml.types import MarkdownString + +logger = get_logger(__name__) + + +@step +def report( + workspace_dir: Path, + commit_sha: str, + prompt_used: str, + agent_results: Dict, + baseline_results: Optional[Dict], +) -> Annotated[MarkdownString, "final_report"]: + """ + Generate comprehensive markdown report for pipeline execution. 
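For reference, the skeleton generator in `gen_tests_baseline.py` above produces files along these lines; the module and member names here are hypothetical and the snippet is hand-written from the template rather than captured from a run:

```python
"""
Baseline/skeleton tests for src/calculator.py
Generated using heuristic analysis.
"""

import pytest
import unittest
from unittest.mock import Mock, patch

# from calculator import add, Calculator


def test_add_basic():
    """Basic test for add."""
    # TODO: Implement test for add
    pass


def test_add_error_cases():
    """Error case test for add."""
    # TODO: Test error conditions for add
    pass


class TestCalculator(unittest.TestCase):
    """Test suite for Calculator."""

    def setUp(self):
        """Set up test fixtures."""
        # TODO: Initialize test fixtures
        pass

    def test_calculator_init(self):
        """Test Calculator initialization."""
        # TODO: Test class initialization
        pass

    def test_calculator_methods(self):
        """Test Calculator methods."""
        # TODO: Test class methods
        pass


if __name__ == "__main__":
    unittest.main()
```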
+ + Args: + workspace_dir: Workspace directory path + commit_sha: Git commit SHA + prompt_used: Prompt template used + agent_results: Agent test results + baseline_results: Baseline test results (optional) + + Returns: + Path to generated markdown report + """ + logger.info("Generating pipeline execution report") + + # Create report file + report_file = Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" + + # Evaluate coverage metrics first + evaluation_metrics = _evaluate_coverage_metrics(agent_results, baseline_results, commit_sha) + + # Generate report content + report_content = _generate_report_content( + workspace_dir, + commit_sha, + prompt_used, + agent_results, + baseline_results, + evaluation_metrics, + ) + + # Write report file + with open(report_file, 'w') as f: + f.write(report_content) + + logger.info(f"Report generated: {report_file}") + + # Return as MarkdownString for dashboard visualization + return MarkdownString(report_content) + + +def _evaluate_coverage_metrics( + agent_results: Dict, + baseline_results: Optional[Dict], + commit_sha: str, +) -> Dict: + """Evaluate coverage metrics and compare agent vs baseline approaches.""" + + # Extract agent metrics - use actual values from test results + coverage_total_agent = agent_results.get("coverage_total", 0.0) + tests_passed_agent = agent_results.get("tests_passed", 0) + tests_failed_agent = agent_results.get("tests_failed", 0) + + total_tests_agent = tests_passed_agent + tests_failed_agent + pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 + + # Extract baseline metrics + coverage_total_baseline = 0.0 + if baseline_results and not baseline_results.get("skipped", False): + coverage_total_baseline = baseline_results.get("coverage_total", 0.0) + + # Compare agent vs baseline coverage + coverage_improvement = coverage_total_agent - coverage_total_baseline + + # Analyze coverage quality + pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" + coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" + + return { + "coverage_total_agent": coverage_total_agent, + "coverage_total_baseline": coverage_total_baseline, + "coverage_improvement": coverage_improvement, + "tests_passed_agent": tests_passed_agent, + "tests_failed_agent": tests_failed_agent, + "pass_rate_agent": pass_rate_agent, + "pass_rate_quality": pass_rate_quality, + "coverage_quality": coverage_quality, + "commit_sha": commit_sha, + "files_analyzed": len(agent_results.get("coverage_by_file", {})), + } + + +def _generate_report_content( + workspace_dir: Path, + commit_sha: str, + prompt_used: str, + agent_results: Dict, + baseline_results: Optional[Dict], + evaluation_metrics: Dict, +) -> str: + """Generate markdown report content.""" + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # Header + report = f"""# QualityFlow Pipeline Report + +Generated: {timestamp} +Commit: `{commit_sha}` +Workspace: `{workspace_dir}` + +## Executive Summary + +""" + + # Executive summary + coverage_agent = evaluation_metrics.get("coverage_total_agent", 0.0) + coverage_baseline = evaluation_metrics.get("coverage_total_baseline", 0.0) + improvement = evaluation_metrics.get("coverage_improvement", 0.0) + quality = evaluation_metrics.get("coverage_quality", "unknown") + + quality_emoji = "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" + improvement_emoji = 
"๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" + + report += f"""{quality_emoji} **Coverage Quality**: {quality.upper()} +{improvement_emoji} **Agent vs Baseline**: {coverage_agent:.2f}% vs {coverage_baseline:.2f}% ({improvement:+.2f}%) +๐Ÿงช **Tests**: {agent_results.get('tests_passed', 0)} passed, {agent_results.get('tests_failed', 0)} failed +๐Ÿ“ **Files**: {evaluation_metrics.get('files_analyzed', 0)} analyzed + +""" + + # Agent results section + report += """## Agent Test Results + +""" + + if agent_results.get("skipped", False): + report += "Agent tests were skipped.\n\n" + else: + report += f"""- **Tests Passed**: {agent_results.get('tests_passed', 0)} +- **Tests Failed**: {agent_results.get('tests_failed', 0)} +- **Pass Rate**: {evaluation_metrics.get('pass_rate_agent', 0.0):.1%} +- **Coverage**: {agent_results.get('coverage_total', 0.0):.2f}% +- **JUnit Report**: `{agent_results.get('junit_path', 'N/A')}` +- **Coverage Report**: `{agent_results.get('coverage_path', 'N/A')}` +- **Logs**: `{agent_results.get('logs_path', 'N/A')}` + +""" + + # Baseline results section (if available) + if baseline_results and not baseline_results.get("skipped", False): + report += """## Baseline Test Results + +""" + report += f"""- **Tests Passed**: {baseline_results.get('tests_passed', 0)} +- **Tests Failed**: {baseline_results.get('tests_failed', 0)} +- **Coverage**: {baseline_results.get('coverage_total', 0.0):.2f}% +- **JUnit Report**: `{baseline_results.get('junit_path', 'N/A')}` +- **Coverage Report**: `{baseline_results.get('coverage_path', 'N/A')}` + +""" + + # Evaluation metrics section + report += """## Coverage Analysis + +""" + + pass_rate = evaluation_metrics.get("pass_rate_agent", 0.0) + pass_quality = evaluation_metrics.get("pass_rate_quality", "unknown") + + report += f"""- **Agent Coverage**: {coverage_agent:.2f}% ({quality}) +- **Baseline Coverage**: {coverage_baseline:.2f}% +- **Improvement**: {improvement:+.2f}% +- **Test Pass Rate**: {pass_rate:.1%} ({pass_quality}) +- **Files Analyzed**: {evaluation_metrics.get('files_analyzed', 0)} + +""" + + # Recommendations section + report += """## Recommendations + +""" + if quality == "excellent": + report += "๐ŸŽ‰ **Excellent coverage!** Consider this approach for production use.\n" + elif quality == "good": + report += "๐Ÿ‘ **Good coverage.** Consider tweaking prompts or selection strategy for improvement.\n" + else: + report += "โš ๏ธ **Coverage needs improvement.** Try different prompts, models, or increase max_tests_per_file.\n" + + if improvement > 5: + report += "๐Ÿ“ˆ **Agent significantly outperforms baseline** - LLM approach is working well.\n" + elif improvement > 0: + report += "๐Ÿ“Š **Agent slightly better than baseline** - room for optimization.\n" + else: + report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" + + # Configuration section + report += """## Configuration + +### Prompt Template +``` +""" + report += prompt_used[:500] + ("..." 
if len(prompt_used) > 500 else "") + report += """ +``` + +### File Coverage Details +""" + + coverage_by_file = agent_results.get("coverage_by_file", {}) + if coverage_by_file: + report += "| File | Coverage |\n|------|----------|\n" + for file_path, coverage_pct in sorted(coverage_by_file.items()): + report += f"| `{file_path}` | {coverage_pct:.1f}% |\n" + else: + report += "No file-level coverage data available.\n" + + report += """ + +--- +*Generated by QualityFlow - Production-ready test generation with ZenML* +""" + + return report \ No newline at end of file diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py new file mode 100644 index 00000000..a4d12385 --- /dev/null +++ b/qualityflow/steps/run_tests.py @@ -0,0 +1,258 @@ +""" +Run tests and collect coverage metrics. +""" + +import subprocess +import tempfile +import shutil +from pathlib import Path +from typing import Annotated, Dict, Optional + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def run_tests( + workspace_dir: Path, + tests_dir: Optional[Path], + label: str = "tests", +) -> Annotated[Dict, "test_results"]: + """Run tests and collect coverage metrics. + + Args: + workspace_dir: Path to workspace directory + tests_dir: Path object to tests directory (None if no tests) + label: Label for this test run + + Returns: + Dictionary containing test results and metrics + """ + if tests_dir is None: + logger.info(f"No tests directory provided for {label}, skipping") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 0, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": None, + "coverage_path": None, + "logs_path": None, + "skipped": True, + } + + logger.info(f"Running {label} tests from {tests_dir}") + + # Create output directory for this test run + output_dir = tempfile.mkdtemp(prefix=f"qualityflow_{label}_results_") + output_path = Path(output_dir) + + junit_file = output_path / "junit.xml" + coverage_file = output_path / "coverage.xml" + logs_file = output_path / "test_logs.txt" + + # Copy tests to workspace (pytest needs them in PYTHONPATH) + workspace_tests_dir = Path(workspace_dir) / f"tests_{label}" + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir) + shutil.copytree(tests_dir, workspace_tests_dir) + + try: + # Run pytest with coverage + pytest_cmd = [ + "python", "-m", "pytest", + str(workspace_tests_dir), + "--junitxml", str(junit_file), + "--cov", str(workspace_dir), + "--cov-report", f"xml:{coverage_file}", + "--cov-report", "term", + "-v" + ] + + logger.info(f"Running command: {' '.join(pytest_cmd)}") + logger.info(f"Working directory: {workspace_dir}") + logger.info(f"Test directory: {workspace_tests_dir}") + + # Debug: list test files + if workspace_tests_dir.exists(): + test_files = list(workspace_tests_dir.glob("*.py")) + logger.info(f"Test files found: {[f.name for f in test_files]}") + else: + logger.warning(f"Test directory does not exist: {workspace_tests_dir}") + + result = subprocess.run( + pytest_cmd, + cwd=str(workspace_dir), + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + # Save logs and also log to console for debugging + with open(logs_file, 'w') as f: + f.write(f"Command: {' '.join(pytest_cmd)}\n") + f.write(f"Return code: {result.returncode}\n\n") + f.write("STDOUT:\n") + f.write(result.stdout) + f.write("\nSTDERR:\n") + f.write(result.stderr) + + # Also log the pytest output for debugging + logger.info(f"Pytest return code: 
{result.returncode}") + if result.stdout: + logger.info(f"Pytest stdout: {result.stdout}") + if result.stderr: + logger.info(f"Pytest stderr: {result.stderr}") + + # Parse results + test_results = _parse_test_results( + result, junit_file, coverage_file, logs_file, label + ) + + logger.info(f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage") + + return test_results + + except subprocess.TimeoutExpired: + logger.error(f"Test run for {label} timed out after 5 minutes") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file), + "error": "Test execution timed out", + } + + except Exception as e: + logger.error(f"Failed to run tests for {label}: {e}") + return { + "label": label, + "tests_passed": 0, + "tests_failed": 1, + "coverage_total": 0.0, + "coverage_by_file": {}, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file) if logs_file.exists() else None, + "error": str(e), + } + + finally: + # Clean up copied tests + if workspace_tests_dir.exists(): + shutil.rmtree(workspace_tests_dir, ignore_errors=True) + + +def _parse_test_results( + result: subprocess.CompletedProcess, + junit_file: Path, + coverage_file: Path, + logs_file: Path, + label: str, +) -> Dict: + """Parse test execution results.""" + + # Parse pytest output for basic stats + tests_passed = 0 + tests_failed = 0 + + if result.stdout: + lines = result.stdout.split('\n') + for line in lines: + if ' passed' in line and ' failed' in line: + # Line like "2 failed, 3 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == 'passed' and i > 0: + tests_passed = int(parts[i-1]) + elif part == 'failed' and i > 0: + tests_failed = int(parts[i-1]) + elif ' passed' in line and 'failed' not in line: + # Line like "5 passed in 1.23s" + parts = line.split() + for i, part in enumerate(parts): + if part == 'passed' and i > 0: + tests_passed = int(parts[i-1]) + + # Parse coverage from XML if available + coverage_total = 0.0 + coverage_by_file = {} + + if coverage_file.exists(): + coverage_total, coverage_by_file = _parse_coverage_xml(coverage_file) + + return { + "label": label, + "tests_passed": tests_passed, + "tests_failed": tests_failed, + "coverage_total": coverage_total, + "coverage_by_file": coverage_by_file, + "junit_path": str(junit_file) if junit_file.exists() else None, + "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "logs_path": str(logs_file), + "return_code": result.returncode, + } + + +def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: + """Parse coverage XML file.""" + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(coverage_file) + root = tree.getroot() + + # Debug: log the XML structure + logger.info(f"Coverage XML root tag: {root.tag}") + logger.info(f"Coverage XML root attribs: {root.attrib}") + + # Get overall coverage - try different formats + coverage_total = 0.0 + + # Modern pytest-cov uses 'coverage' as root element + if root.tag == 'coverage': + line_rate = root.get('line-rate', '0') + if line_rate != '0': + coverage_total = float(line_rate) * 100 + logger.info(f"Found 
line-rate in coverage root: {line_rate}") + else: + # Try finding coverage element nested + coverage_element = root.find('.//coverage') + if coverage_element is not None: + line_rate = coverage_element.get('line-rate', '0') + coverage_total = float(line_rate) * 100 + logger.info(f"Found coverage element with line-rate: {line_rate}") + + # If still no coverage found, try branches-valid attribute (alternative format) + if coverage_total == 0.0: + branches_valid = root.get('branches-valid', '0') + branches_covered = root.get('branches-covered', '0') + lines_valid = root.get('lines-valid', '0') + lines_covered = root.get('lines-covered', '0') + + if lines_valid != '0': + line_coverage = float(lines_covered) / float(lines_valid) + coverage_total = line_coverage * 100 + logger.info(f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%") + + # Get per-file coverage + coverage_by_file = {} + for class_elem in root.findall('.//class'): + filename = class_elem.get('filename', '') + line_rate = class_elem.get('line-rate', '0') + if filename: + coverage_by_file[filename] = float(line_rate) * 100 + + logger.info(f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files") + return coverage_total, coverage_by_file + + except Exception as e: + logger.warning(f"Failed to parse coverage XML: {e}") + return 0.0, {} \ No newline at end of file diff --git a/qualityflow/steps/select_input.py b/qualityflow/steps/select_input.py new file mode 100644 index 00000000..c274e8df --- /dev/null +++ b/qualityflow/steps/select_input.py @@ -0,0 +1,38 @@ +""" +Select input source specification step. +""" + +from typing import Annotated, Dict +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def select_input( + repo_url: str = "https://github.com/psf/requests.git", + ref: str = "main", + target_glob: str = "src/**/*.py", +) -> Annotated[Dict[str, str], "source_spec"]: + """ + Resolve source specification for test generation. 
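`_parse_coverage_xml` above reads the Cobertura-style XML that `pytest-cov` writes for `--cov-report xml:...`. A self-contained sketch of that format and of the same line-rate arithmetic (filenames and numbers invented):

```python
# Sketch of the Cobertura-style coverage XML and the line-rate math used above.
# Filenames and numbers are invented for illustration.
import xml.etree.ElementTree as ET

SAMPLE = """
<coverage line-rate="0.8125" lines-valid="160" lines-covered="130">
  <packages>
    <package name="toy_lib">
      <classes>
        <class filename="examples/toy_lib/calculator.py" line-rate="0.90"/>
        <class filename="examples/toy_lib/string_utils.py" line-rate="0.72"/>
      </classes>
    </package>
  </packages>
</coverage>
"""

root = ET.fromstring(SAMPLE)
total = float(root.get("line-rate", "0")) * 100  # 81.25
per_file = {
    elem.get("filename"): float(elem.get("line-rate", "0")) * 100
    for elem in root.iter("class")
}
print(total, per_file)
```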
+ + Args: + repo_url: Repository URL to analyze + ref: Git reference (branch, tag, commit) + target_glob: Glob pattern for target files + + Returns: + Source specification dictionary + """ + logger.info(f"Selecting input source: {repo_url}@{ref}") + + spec = { + "repo_url": repo_url, + "ref": ref, + "target_glob": target_glob, + } + + logger.info(f"Source spec: {spec}") + return spec \ No newline at end of file From 3656f91c41c7d5be0ae424e16448dfbac55b0e92 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:30:28 +0200 Subject: [PATCH 2/8] Formattingg --- qualityflow/examples/toy_lib/__init__.py | 2 +- qualityflow/examples/toy_lib/calculator.py | 24 ++- qualityflow/examples/toy_lib/string_utils.py | 64 ++++--- qualityflow/pipelines/__init__.py | 2 +- .../pipelines/generate_and_evaluate.py | 45 ++--- qualityflow/run.py | 13 +- qualityflow/steps/__init__.py | 12 +- qualityflow/steps/analyze_code.py | 89 +++++---- qualityflow/steps/evaluate_coverage.py | 51 +++-- qualityflow/steps/fetch_source.py | 38 ++-- qualityflow/steps/gen_tests_agent.py | 180 ++++++++++-------- qualityflow/steps/gen_tests_baseline.py | 96 +++++----- qualityflow/steps/report.py | 136 +++++++------ qualityflow/steps/run_tests.py | 168 +++++++++------- qualityflow/steps/select_input.py | 11 +- 15 files changed, 534 insertions(+), 397 deletions(-) diff --git a/qualityflow/examples/toy_lib/__init__.py b/qualityflow/examples/toy_lib/__init__.py index c70599d5..8b91a8dd 100644 --- a/qualityflow/examples/toy_lib/__init__.py +++ b/qualityflow/examples/toy_lib/__init__.py @@ -2,4 +2,4 @@ QualityFlow toy library example for testing. """ -__version__ = "0.1.0" \ No newline at end of file +__version__ = "0.1.0" diff --git a/qualityflow/examples/toy_lib/calculator.py b/qualityflow/examples/toy_lib/calculator.py index c9ec644d..38bc9964 100644 --- a/qualityflow/examples/toy_lib/calculator.py +++ b/qualityflow/examples/toy_lib/calculator.py @@ -10,25 +10,33 @@ def __init__(self): """Initialize calculator with empty history.""" self.history = [] - def add(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def add( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Add two numbers.""" result = a + b self.history.append(f"{a} + {b} = {result}") return result - def subtract(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def subtract( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Subtract second number from first.""" result = a - b self.history.append(f"{a} - {b} = {result}") return result - def multiply(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def multiply( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Multiply two numbers.""" result = a * b self.history.append(f"{a} * {b} = {result}") return result - def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float]: + def divide( + self, a: Union[int, float], b: Union[int, float] + ) -> Union[int, float]: """Divide first number by second.""" if b == 0: raise ValueError("Cannot divide by zero") @@ -36,9 +44,11 @@ def divide(self, a: Union[int, float], b: Union[int, float]) -> Union[int, float self.history.append(f"{a} / {b} = {result}") return result - def power(self, base: Union[int, float], exponent: Union[int, float]) -> Union[int, float]: + def power( + self, base: Union[int, float], exponent: Union[int, float] + ) -> Union[int, float]: """Raise base to the power of 
exponent.""" - result = base ** exponent + result = base**exponent self.history.append(f"{base} ** {exponent} = {result}") return result @@ -72,4 +82,4 @@ def is_prime(n: int) -> bool: for i in range(3, int(n**0.5) + 1, 2): if n % i == 0: return False - return True \ No newline at end of file + return True diff --git a/qualityflow/examples/toy_lib/string_utils.py b/qualityflow/examples/toy_lib/string_utils.py index d842b500..276509ab 100644 --- a/qualityflow/examples/toy_lib/string_utils.py +++ b/qualityflow/examples/toy_lib/string_utils.py @@ -3,7 +3,7 @@ """ import re -from typing import List, Optional +from typing import List def reverse_string(s: str) -> str: @@ -17,13 +17,13 @@ def is_palindrome(s: str, ignore_case: bool = True) -> bool: """Check if a string is a palindrome.""" if not isinstance(s, str): raise TypeError("Input must be a string") - + # Clean the string - keep only alphanumeric characters - cleaned = re.sub(r'[^a-zA-Z0-9]', '', s) - + cleaned = re.sub(r"[^a-zA-Z0-9]", "", s) + if ignore_case: cleaned = cleaned.lower() - + return cleaned == cleaned[::-1] @@ -31,10 +31,10 @@ def count_words(text: str) -> int: """Count words in text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + if not text.strip(): return 0 - + words = text.split() return len(words) @@ -43,16 +43,16 @@ def capitalize_words(text: str) -> str: """Capitalize the first letter of each word.""" if not isinstance(text, str): raise TypeError("Input must be a string") - - return ' '.join(word.capitalize() for word in text.split()) + + return " ".join(word.capitalize() for word in text.split()) def extract_emails(text: str) -> List[str]: """Extract email addresses from text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - - email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + + email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" return re.findall(email_pattern, text) @@ -62,59 +62,61 @@ def truncate_string(s: str, max_length: int, suffix: str = "...") -> str: raise TypeError("Input must be a string") if not isinstance(max_length, int) or max_length < 0: raise ValueError("max_length must be a non-negative integer") - + if len(s) <= max_length: return s - + if max_length <= len(suffix): return s[:max_length] - - return s[:max_length - len(suffix)] + suffix + + return s[: max_length - len(suffix)] + suffix class TextProcessor: """Text processing utility class.""" - + def __init__(self, default_encoding: str = "utf-8"): self.default_encoding = default_encoding self.processed_count = 0 - + def clean_text(self, text: str, remove_punctuation: bool = False) -> str: """Clean text by removing extra whitespace and optionally punctuation.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + # Remove extra whitespace - cleaned = ' '.join(text.split()) - + cleaned = " ".join(text.split()) + if remove_punctuation: # Remove punctuation except spaces - cleaned = re.sub(r'[^\w\s]', '', cleaned) - + cleaned = re.sub(r"[^\w\s]", "", cleaned) + self.processed_count += 1 return cleaned - - def word_frequency(self, text: str, ignore_case: bool = True) -> dict[str, int]: + + def word_frequency( + self, text: str, ignore_case: bool = True + ) -> dict[str, int]: """Count word frequency in text.""" if not isinstance(text, str): raise TypeError("Input must be a string") - + words = text.split() if ignore_case: words = [word.lower() for word in words] - + frequency = {} for word in words: # Remove punctuation from word 
- clean_word = re.sub(r'[^\w]', '', word) + clean_word = re.sub(r"[^\w]", "", word) if clean_word: frequency[clean_word] = frequency.get(clean_word, 0) + 1 - + return frequency - + def get_stats(self) -> dict[str, int]: """Get processing statistics.""" return { "processed_count": self.processed_count, - "default_encoding": self.default_encoding - } \ No newline at end of file + "default_encoding": self.default_encoding, + } diff --git a/qualityflow/pipelines/__init__.py b/qualityflow/pipelines/__init__.py index 525d0f58..af93c207 100644 --- a/qualityflow/pipelines/__init__.py +++ b/qualityflow/pipelines/__init__.py @@ -2,4 +2,4 @@ from .generate_and_evaluate import generate_and_evaluate -__all__ = ["generate_and_evaluate"] \ No newline at end of file +__all__ = ["generate_and_evaluate"] diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py index c50754e3..e359afb8 100644 --- a/qualityflow/pipelines/generate_and_evaluate.py +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -2,18 +2,15 @@ QualityFlow experiment pipeline for test generation and evaluation. """ -from typing import Annotated - -from zenml import pipeline -from zenml.logger import get_logger - -from steps.select_input import select_input -from steps.fetch_source import fetch_source from steps.analyze_code import analyze_code +from steps.fetch_source import fetch_source from steps.gen_tests_agent import gen_tests_agent from steps.gen_tests_baseline import gen_tests_baseline -from steps.run_tests import run_tests from steps.report import report +from steps.run_tests import run_tests +from steps.select_input import select_input +from zenml import pipeline +from zenml.logger import get_logger logger = get_logger(__name__) @@ -21,38 +18,36 @@ @pipeline(name="generate_and_evaluate") def generate_and_evaluate() -> None: """QualityFlow pipeline for generating and evaluating tests. - + Simple, focused pipeline: 1. Analyze code to find files needing tests - 2. Generate tests using LLM and baseline approaches + 2. Generate tests using LLM and baseline approaches 3. Run tests and measure coverage 4. 
Report results for comparison """ # Step 1: Resolve source specification spec = select_input() - + # Step 2: Fetch and materialize workspace workspace_dir, commit_sha = fetch_source(spec) - + # Step 3: Analyze and select code files - code_summary = analyze_code( - workspace_dir, commit_sha - ) - + code_summary = analyze_code(workspace_dir, commit_sha) + # Step 4: Generate tests using LLM agent - agent_tests_dir, prompt_used = gen_tests_agent( - workspace_dir, code_summary - ) - + agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary) + # Step 5: Generate baseline tests (optional) baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) - + # Step 6: Run agent tests agent_results = run_tests(workspace_dir, agent_tests_dir, label="agent") - + # Step 7: Run baseline tests (if available) - baseline_results = run_tests(workspace_dir, baseline_tests_dir, label="baseline") - + baseline_results = run_tests( + workspace_dir, baseline_tests_dir, label="baseline" + ) + # Step 8: Generate comprehensive report (includes evaluation) report( workspace_dir, @@ -60,4 +55,4 @@ def generate_and_evaluate() -> None: prompt_used, agent_results, baseline_results, - ) \ No newline at end of file + ) diff --git a/qualityflow/run.py b/qualityflow/run.py index 11751350..c9ff7370 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -14,7 +14,7 @@ @click.command() @click.option( "--config", - "-c", + "-c", type=click.Path(exists=True, dir_okay=False), default=None, required=False, @@ -28,7 +28,7 @@ ) def main(config: str | None, no_cache: bool): """Run QualityFlow test generation and coverage analysis pipeline. - + Simple pipeline that generates tests using LLM, runs them, measures coverage, and compares results against baseline approaches. 
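The same pipeline can also be driven without the CLI shown above; a minimal sketch (it assumes the working directory is `qualityflow/` so the plain `pipelines`/`steps` imports used throughout this patch resolve, and it points at the default config added under `configs/`):

```python
# Programmatic equivalent of `python run.py -c configs/experiment.default.yaml --no-cache`.
# Assumes the current working directory is qualityflow/ so that the
# package-less imports used in this patch (pipelines.*, steps.*) resolve.
from pipelines.generate_and_evaluate import generate_and_evaluate

pipeline_instance = generate_and_evaluate.with_options(
    config_path="configs/experiment.default.yaml",
    enable_cache=False,
)
pipeline_instance()
```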
""" @@ -38,10 +38,11 @@ def main(config: str | None, no_cache: bool): chosen_config = config or str(default_config) try: - logger.info(f"Starting QualityFlow pipeline with config: {chosen_config}") + logger.info( + f"Starting QualityFlow pipeline with config: {chosen_config}" + ) pipeline_instance = generate_and_evaluate.with_options( - config_path=chosen_config, - enable_cache=not no_cache + config_path=chosen_config, enable_cache=not no_cache ) pipeline_instance() logger.info("QualityFlow pipeline completed successfully!") @@ -52,4 +53,4 @@ def main(config: str | None, no_cache: bool): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/qualityflow/steps/__init__.py b/qualityflow/steps/__init__.py index 70abca08..c6857ec7 100644 --- a/qualityflow/steps/__init__.py +++ b/qualityflow/steps/__init__.py @@ -1,21 +1,21 @@ """QualityFlow pipeline steps.""" -from .select_input import select_input -from .fetch_source import fetch_source from .analyze_code import analyze_code +from .evaluate_coverage import evaluate_coverage +from .fetch_source import fetch_source from .gen_tests_agent import gen_tests_agent from .gen_tests_baseline import gen_tests_baseline -from .run_tests import run_tests -from .evaluate_coverage import evaluate_coverage from .report import report +from .run_tests import run_tests +from .select_input import select_input __all__ = [ "select_input", "fetch_source", - "analyze_code", + "analyze_code", "gen_tests_agent", "gen_tests_baseline", "run_tests", "evaluate_coverage", "report", -] \ No newline at end of file +] diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py index 0bfebd9c..3a8f5a14 100644 --- a/qualityflow/steps/analyze_code.py +++ b/qualityflow/steps/analyze_code.py @@ -2,23 +2,25 @@ Analyze and select code files for test generation. """ -import glob import ast +import glob import os +from enum import Enum from pathlib import Path -from typing import Annotated, Dict, List, Tuple +from typing import Annotated, Dict, List from zenml import step from zenml.logger import get_logger -from enum import Enum class SelectionStrategy(str, Enum): """Code file selection strategies.""" + LOW_COVERAGE = "low_coverage" CHANGED_FILES = "changed_files" ALL = "all" + logger = get_logger(__name__) @@ -32,80 +34,85 @@ def analyze_code( ) -> Annotated[Dict, "code_summary"]: """ Analyze workspace and select candidate files for test generation. 
- + Args: workspace_dir: Path to workspace directory commit_sha: Git commit SHA target_glob: Glob pattern for target files strategy: File selection strategy max_files: Maximum number of files to select - + Returns: Code summary dictionary containing selected files and metadata """ logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") - + workspace_path = Path(workspace_dir) - + # Find all Python files matching glob pattern all_files = [] for pattern in target_glob.split(","): pattern = pattern.strip() - matched_files = glob.glob(str(workspace_path / pattern), recursive=True) + matched_files = glob.glob( + str(workspace_path / pattern), recursive=True + ) all_files.extend(matched_files) - + # Make paths relative to workspace relative_files = [ - os.path.relpath(f, workspace_dir) - for f in all_files - if f.endswith('.py') and os.path.isfile(f) + os.path.relpath(f, workspace_dir) + for f in all_files + if f.endswith(".py") and os.path.isfile(f) ] - + logger.info(f"Found {len(relative_files)} Python files") - + # Calculate complexity scores complexity_scores = {} valid_files = [] - + for file_path in relative_files: full_path = workspace_path / file_path try: - with open(full_path, 'r', encoding='utf-8') as f: + with open(full_path, "r", encoding="utf-8") as f: content = f.read() - + # Parse AST and calculate basic complexity tree = ast.parse(content) complexity = _calculate_complexity(tree) complexity_scores[file_path] = complexity valid_files.append(file_path) - + except (SyntaxError, UnicodeDecodeError) as e: logger.warning(f"Skipping {file_path} due to parsing error: {e}") continue - + # Select files based on strategy - selected_files = _select_files(valid_files, complexity_scores, strategy, max_files) - + selected_files = _select_files( + valid_files, complexity_scores, strategy, max_files + ) + code_summary = { "selected_files": selected_files, "total_files": len(valid_files), "selection_reason": f"Selected top {len(selected_files)} files using {strategy} strategy", - "complexity_scores": {f: complexity_scores[f] for f in selected_files} + "complexity_scores": {f: complexity_scores[f] for f in selected_files}, } - + logger.info(f"Selected {len(selected_files)} files: {selected_files}") - + return code_summary def _calculate_complexity(tree: ast.AST) -> float: """Calculate basic complexity score for an AST.""" + class ComplexityVisitor(ast.NodeVisitor): def __init__(self): self.complexity = 0 self.functions = 0 self.classes = 0 - + def visit_FunctionDef(self, node): self.functions += 1 self.complexity += 1 @@ -113,39 +120,43 @@ def visit_FunctionDef(self, node): if isinstance(child, (ast.If, ast.For, ast.While, ast.Try)): self.complexity += 1 self.generic_visit(node) - + def visit_ClassDef(self, node): self.classes += 1 self.complexity += 1 self.generic_visit(node) - + visitor = ComplexityVisitor() visitor.visit(tree) - + # Combine metrics into single score return visitor.complexity + visitor.functions * 0.5 + visitor.classes * 2 def _select_files( - files: List[str], - complexity_scores: Dict[str, float], - strategy: SelectionStrategy, - max_files: int + files: List[str], + complexity_scores: Dict[str, float], + strategy: SelectionStrategy, + max_files: int, ) -> List[str]: """Select files based on strategy.""" - + if strategy == SelectionStrategy.ALL: return files[:max_files] - + elif strategy == SelectionStrategy.LOW_COVERAGE: # Prioritize complex files that likely need more tests - sorted_files = sorted(files, key=lambda f: complexity_scores[f], reverse=True) + 
sorted_files = sorted( + files, key=lambda f: complexity_scores[f], reverse=True + ) return sorted_files[:max_files] - + elif strategy == SelectionStrategy.CHANGED_FILES: # For this demo, just return all files (in real implementation, would use git diff) - logger.warning("CHANGED_FILES strategy not fully implemented, falling back to ALL") + logger.warning( + "CHANGED_FILES strategy not fully implemented, falling back to ALL" + ) return files[:max_files] - + else: - raise ValueError(f"Unknown selection strategy: {strategy}") \ No newline at end of file + raise ValueError(f"Unknown selection strategy: {strategy}") diff --git a/qualityflow/steps/evaluate_coverage.py b/qualityflow/steps/evaluate_coverage.py index 9d384b10..f069bfb5 100644 --- a/qualityflow/steps/evaluate_coverage.py +++ b/qualityflow/steps/evaluate_coverage.py @@ -3,7 +3,8 @@ """ from typing import Annotated, Dict, Optional -from zenml import step, Model + +from zenml import step from zenml.logger import get_logger logger = get_logger(__name__) @@ -17,39 +18,55 @@ def evaluate_coverage( ) -> Annotated[Dict, "evaluation_metrics"]: """ Evaluate coverage metrics and compare agent vs baseline approaches. - + Args: agent_results: Test results from agent-generated tests baseline_results: Test results from baseline tests (optional) commit_sha: Current commit SHA - + Returns: Evaluation metrics dictionary with coverage comparison """ logger.info("Evaluating coverage metrics and computing deltas") - + # Extract agent metrics coverage_total_agent = agent_results.get("coverage_total", 0.0) tests_passed_agent = agent_results.get("tests_passed", 0) tests_failed_agent = agent_results.get("tests_failed", 0) - + total_tests_agent = tests_passed_agent + tests_failed_agent - pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 - + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + # Extract baseline metrics coverage_total_baseline = None if baseline_results and not baseline_results.get("skipped", False): coverage_total_baseline = baseline_results.get("coverage_total", 0.0) - + # Compare agent vs baseline coverage coverage_improvement = 0.0 if coverage_total_baseline is not None: coverage_improvement = coverage_total_agent - coverage_total_baseline - + # Analyze coverage quality - pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" - coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" - + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + evaluation_metrics = { "coverage_total_agent": coverage_total_agent, "coverage_total_baseline": coverage_total_baseline, @@ -62,7 +79,9 @@ def evaluate_coverage( "commit_sha": commit_sha, "files_analyzed": len(agent_results.get("coverage_by_file", {})), } - - logger.info(f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, improvement={coverage_improvement:+.2f}%") - - return evaluation_metrics \ No newline at end of file + + logger.info( + f"Evaluation complete: agent_coverage={coverage_total_agent:.2f}%, baseline_coverage={coverage_total_baseline or 0:.2f}%, 
improvement={coverage_improvement:+.2f}%" + ) + + return evaluation_metrics diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index c117f2d2..dfbfd609 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -2,8 +2,8 @@ Fetch source code workspace step. """ -import tempfile import subprocess +import tempfile from pathlib import Path from typing import Annotated, Dict, Tuple @@ -19,32 +19,41 @@ def fetch_source( ) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: """ Fetch and materialize workspace from git repository. - + Args: source_spec: Source specification from select_input step - + Returns: Tuple of workspace directory path and commit SHA """ repo_url = source_spec["repo_url"] ref = source_spec["ref"] - + logger.info(f"Fetching source from {repo_url}@{ref}") - + # Create temporary workspace workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") workspace_path = Path(workspace_dir) - + try: # Clone repository logger.info(f"Cloning {repo_url} to {workspace_dir}") subprocess.run( - ["git", "clone", "--depth", "1", "--branch", ref, repo_url, workspace_dir], + [ + "git", + "clone", + "--depth", + "1", + "--branch", + ref, + repo_url, + workspace_dir, + ], check=True, capture_output=True, text=True, ) - + # Get commit SHA result = subprocess.run( ["git", "rev-parse", "HEAD"], @@ -54,11 +63,13 @@ def fetch_source( text=True, ) commit_sha = result.stdout.strip() - - logger.info(f"Workspace ready at {workspace_dir}, commit: {commit_sha}") - + + logger.info( + f"Workspace ready at {workspace_dir}, commit: {commit_sha}" + ) + return Path(workspace_dir), commit_sha - + except subprocess.CalledProcessError as e: logger.error(f"Failed to fetch source: {e}") raise RuntimeError(f"Git operation failed: {e.stderr}") @@ -66,5 +77,6 @@ def fetch_source( logger.error(f"Unexpected error fetching source: {e}") # Clean up on error import shutil + shutil.rmtree(workspace_dir, ignore_errors=True) - raise \ No newline at end of file + raise diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 8ed37d31..9a918afc 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -3,22 +3,23 @@ """ import tempfile +from enum import Enum from pathlib import Path -from typing import Annotated, Dict, List, Tuple -from jinja2 import Template +from typing import Annotated, Dict, Tuple -from zenml import step +from jinja2 import Template +from zenml import log_metadata, step from zenml.logger import get_logger -from zenml import log_metadata -from enum import Enum class GenerationProvider(str, Enum): """LLM providers for test generation.""" + OPENAI = "openai" ANTHROPIC = "anthropic" FAKE = "fake" + logger = get_logger(__name__) @@ -31,12 +32,9 @@ def gen_tests_agent( prompt_path: str = "prompts/unit_test_v1.jinja", max_tests_per_file: int = 3, max_files: int = 10, -) -> Tuple[ - Annotated[Path, "agent_tests_dir"], - Annotated[str, "prompt_used"] -]: +) -> Tuple[Annotated[Path, "agent_tests_dir"], Annotated[str, "prompt_used"]]: """Generate tests using LLM agent. 
- + Args: workspace_dir: Path to workspace directory code_summary: Code analysis summary containing selected files @@ -45,86 +43,100 @@ def gen_tests_agent( prompt_path: Path to Jinja2 prompt template max_tests_per_file: Maximum tests to generate per file max_files: Maximum number of files to process (for speed control) - + Returns: Tuple of test directory and prompt used """ # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) - + # Limit files if max_files is specified - files_to_process = selected_files[:max_files] if max_files > 0 else selected_files - logger.info(f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}") - + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating tests for {len(files_to_process)}/{len(selected_files)} files using {provider}:{model}" + ) + # Create tests directory tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") tests_path = Path(tests_dir) - + # Load prompt template workspace_path = Path(workspace_dir) prompt_file = workspace_path / prompt_path - + if prompt_file.exists(): - with open(prompt_file, 'r') as f: + with open(prompt_file, "r") as f: prompt_template = f.read() else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() logger.info(f"Using default prompt template, {prompt_path} not found") - + template = Template(prompt_template) - + total_tokens_in = 0 total_tokens_out = 0 materialized_prompts = {} # Store materialized prompts per file - + for file_path in files_to_process: logger.info(f"Generating tests for {file_path}") - + # Read source file full_file_path = workspace_path / file_path - with open(full_file_path, 'r') as f: + with open(full_file_path, "r") as f: source_code = f.read() - + # Render prompt materialized_prompt = template.render( file_path=file_path, source_code=source_code, max_tests=max_tests_per_file, - complexity_score=code_summary.get("complexity_scores", {}).get(file_path, 0) + complexity_score=code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), ) - + # Store the materialized prompt for this file materialized_prompts[file_path] = materialized_prompt - + # Generate tests using provider if provider == GenerationProvider.FAKE: - generated_tests, tokens = _generate_fake_tests(file_path, source_code, max_tests_per_file) + generated_tests, tokens = _generate_fake_tests( + file_path, source_code, max_tests_per_file + ) elif provider == GenerationProvider.OPENAI: - generated_tests, tokens = _generate_openai_tests(materialized_prompt, model) + generated_tests, tokens = _generate_openai_tests( + materialized_prompt, model + ) elif provider == GenerationProvider.ANTHROPIC: - generated_tests, tokens = _generate_anthropic_tests(materialized_prompt, model) + generated_tests, tokens = _generate_anthropic_tests( + materialized_prompt, model + ) else: raise ValueError(f"Unsupported provider: {provider}") - + total_tokens_in += tokens.get("tokens_in", 0) total_tokens_out += tokens.get("tokens_out", 0) - + # Save generated tests test_file_name = f"test_{Path(file_path).stem}.py" test_file_path = tests_path / test_file_name - - with open(test_file_path, 'w') as f: + + with open(test_file_path, "w") as f: f.write(generated_tests) - + logger.info(f"Generated tests saved to {test_file_path}") - + # Log comprehensive metadata including materialized prompts metadata = { "token_usage": { "tokens_in": total_tokens_in, 
"tokens_out": total_tokens_out, - "cost_estimate": _estimate_cost(total_tokens_in, total_tokens_out, provider, model), + "cost_estimate": _estimate_cost( + total_tokens_in, total_tokens_out, provider, model + ), }, "config": { "provider": provider.value, @@ -136,13 +148,15 @@ def gen_tests_agent( "materialized_prompts": materialized_prompts, "prompt_template": prompt_template, } - + log_metadata(metadata) - logger.info(f"Test generation complete. Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out") - + logger.info( + f"Test generation complete. Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out" + ) + # Create a better prompt summary for the report prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" - + # Return Path object - ZenML will automatically materialize the folder return Path(tests_dir), prompt_summary @@ -168,11 +182,13 @@ def _get_default_prompt_template() -> str: """ -def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tuple[str, Dict]: +def _generate_fake_tests( + file_path: str, source_code: str, max_tests: int +) -> Tuple[str, Dict]: """Generate fake/mock tests for development/testing.""" # Create a simple module name from file path - module_name = file_path.replace('/', '.').replace('.py', '') - + module_name = file_path.replace("/", ".").replace(".py", "") + test_content = f'''""" Generated tests for {file_path} """ @@ -181,7 +197,7 @@ def _generate_fake_tests(file_path: str, source_code: str, max_tests: int) -> Tu import unittest from unittest.mock import Mock, patch, MagicMock -class Test{file_path.split('/')[-1].replace('.py', '').title()}(unittest.TestCase): +class Test{file_path.split("/")[-1].replace(".py", "").title()}(unittest.TestCase): """Auto-generated test class for {file_path}.""" def test_module_import(self): @@ -222,7 +238,7 @@ def test_coverage_target(self): if __name__ == "__main__": unittest.main() ''' - + tokens = {"tokens_in": 100, "tokens_out": 50} return test_content, tokens @@ -230,31 +246,35 @@ def test_coverage_target(self): def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: """Generate tests using OpenAI API.""" try: - import openai import os - + + import openai + # Get API key from environment api_key = os.getenv("OPENAI_API_KEY") if not api_key: logger.warning("OPENAI_API_KEY not found, using fake tests") return _generate_fake_tests("openai_file", "mock_code", 3) - + client = openai.OpenAI(api_key=api_key) - + # Call OpenAI API response = client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": "You are a Python test generation expert. Generate comprehensive unit tests for the given code."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.", + }, + {"role": "user", "content": prompt}, ], max_tokens=2000, - temperature=0.1 + temperature=0.1, ) - + # Extract test code from response generated_content = response.choices[0].message.content - + # Try to extract Python code blocks if "```python" in generated_content: start = generated_content.find("```python") + 9 @@ -267,16 +287,18 @@ def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: else: # Use the whole response if no code blocks found test_content = generated_content.strip() - + # Token usage for cost estimation tokens = { "tokens_in": response.usage.prompt_tokens, - "tokens_out": response.usage.completion_tokens + "tokens_out": response.usage.completion_tokens, } - - logger.info(f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + + logger.info( + f"Generated tests using OpenAI {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) return test_content, tokens - + except ImportError: logger.warning("OpenAI library not installed, using fake tests") return _generate_fake_tests("openai_file", "mock_code", 3) @@ -289,30 +311,34 @@ def _generate_openai_tests(prompt: str, model: str) -> Tuple[str, Dict]: def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: """Generate tests using Anthropic API.""" try: - import anthropic import os - + + import anthropic + # Get API key from environment api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: logger.warning("ANTHROPIC_API_KEY not found, using fake tests") return _generate_fake_tests("anthropic_file", "mock_code", 3) - + client = anthropic.Anthropic(api_key=api_key) - + # Call Anthropic API response = client.messages.create( model=model, max_tokens=2000, temperature=0.1, messages=[ - {"role": "user", "content": f"You are a Python test generation expert. Generate comprehensive unit tests for the given code.\n\n{prompt}"} - ] + { + "role": "user", + "content": f"You are a Python test generation expert. 
Generate comprehensive unit tests for the given code.\n\n{prompt}", + } + ], ) - + # Extract test content from response generated_content = response.content[0].text - + # Try to extract Python code blocks if "```python" in generated_content: start = generated_content.find("```python") + 9 @@ -325,16 +351,18 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: else: # Use the whole response if no code blocks found test_content = generated_content.strip() - + # Token usage for cost estimation tokens = { "tokens_in": response.usage.input_tokens, - "tokens_out": response.usage.output_tokens + "tokens_out": response.usage.output_tokens, } - - logger.info(f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out") + + logger.info( + f"Generated tests using Anthropic {model}: {tokens['tokens_in']} in, {tokens['tokens_out']} out" + ) return test_content, tokens - + except ImportError: logger.warning("Anthropic library not installed, using fake tests") return _generate_fake_tests("anthropic_file", "mock_code", 3) @@ -344,7 +372,9 @@ def _generate_anthropic_tests(prompt: str, model: str) -> Tuple[str, Dict]: return _generate_fake_tests("anthropic_file", "mock_code", 3) -def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str) -> float: +def _estimate_cost( + tokens_in: int, tokens_out: int, provider: GenerationProvider, model: str +) -> float: """Estimate cost based on token usage.""" # Rough cost estimates (would need real pricing) if provider == GenerationProvider.OPENAI: @@ -355,4 +385,4 @@ def _estimate_cost(tokens_in: int, tokens_out: int, provider: GenerationProvider elif provider == GenerationProvider.ANTHROPIC: return (tokens_in * 0.000008) + (tokens_out * 0.000024) else: - return 0.0 \ No newline at end of file + return 0.0 diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py index 68a0a4e4..a3197748 100644 --- a/qualityflow/steps/gen_tests_baseline.py +++ b/qualityflow/steps/gen_tests_baseline.py @@ -2,8 +2,8 @@ Generate baseline/skeleton tests using heuristics. """ -import tempfile import ast +import tempfile from pathlib import Path from typing import Annotated, Dict, List, Optional @@ -22,65 +22,71 @@ def gen_tests_baseline( ) -> Annotated[Optional[Path], "baseline_tests_dir"]: """ Generate baseline/skeleton tests using heuristic analysis. 
- + Args: workspace_dir: Path to workspace directory code_summary: Code analysis summary containing selected files enabled: Whether baseline generation is enabled max_files: Maximum number of files to process - + Returns: Path to baseline tests directory, or None if disabled """ if not enabled: logger.info("Baseline test generation disabled") return None - + # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) - + # Limit files if max_files is specified - files_to_process = selected_files[:max_files] if max_files > 0 else selected_files - logger.info(f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files") - + files_to_process = ( + selected_files[:max_files] if max_files > 0 else selected_files + ) + logger.info( + f"Generating baseline tests for {len(files_to_process)}/{len(selected_files)} files" + ) + # Create baseline tests directory tests_dir = tempfile.mkdtemp(prefix="qualityflow_baseline_tests_") tests_path = Path(tests_dir) - + workspace_path = Path(workspace_dir) - + for file_path in files_to_process: logger.info(f"Generating baseline tests for {file_path}") - + # Read and parse source file full_file_path = workspace_path / file_path - with open(full_file_path, 'r') as f: + with open(full_file_path, "r") as f: source_code = f.read() - + try: tree = ast.parse(source_code) - + # Extract functions and classes functions, classes = _extract_testable_items(tree) - + # Generate skeleton tests - test_content = _generate_skeleton_tests(file_path, functions, classes) - + test_content = _generate_skeleton_tests( + file_path, functions, classes + ) + # Save baseline tests test_file_name = f"test_{Path(file_path).stem}_baseline.py" test_file_path = tests_path / test_file_name - - with open(test_file_path, 'w') as f: + + with open(test_file_path, "w") as f: f.write(test_content) - + logger.info(f"Baseline tests saved to {test_file_path}") - + except SyntaxError as e: logger.warning(f"Skipping {file_path} due to syntax error: {e}") continue - + logger.info("Baseline test generation complete") - + # Return Path object - ZenML will automatically materialize the folder return Path(tests_dir) @@ -89,23 +95,25 @@ def _extract_testable_items(tree: ast.AST) -> tuple[List[str], List[str]]: """Extract function and class names from AST.""" functions = [] classes = [] - + for node in ast.walk(tree): if isinstance(node, ast.FunctionDef): # Skip private functions (starting with _) - if not node.name.startswith('_'): + if not node.name.startswith("_"): functions.append(node.name) elif isinstance(node, ast.ClassDef): # Skip private classes - if not node.name.startswith('_'): + if not node.name.startswith("_"): classes.append(node.name) - + return functions, classes -def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List[str]) -> str: +def _generate_skeleton_tests( + file_path: str, functions: List[str], classes: List[str] +) -> str: """Generate skeleton test content.""" - + # Create imports section imports = f'''""" Baseline/skeleton tests for {file_path} @@ -116,17 +124,19 @@ def _generate_skeleton_tests(file_path: str, functions: List[str], classes: List import unittest from unittest.mock import Mock, patch ''' - + # Try to determine import path from file path - module_path = file_path.replace('/', '.').replace('.py', '') - if module_path.startswith('src.'): + module_path = file_path.replace("/", ".").replace(".py", "") + if module_path.startswith("src."): module_path = module_path[4:] # Remove 'src.' 
prefix - + if functions or classes: - imports += f"# from {module_path} import {', '.join(functions + classes)}\n\n" + imports += ( + f"# from {module_path} import {', '.join(functions + classes)}\n\n" + ) else: imports += f"# from {module_path} import *\n\n" - + # Generate function tests function_tests = "" for func_name in functions: @@ -141,7 +151,7 @@ def test_{func_name}_error_cases(): # TODO: Test error conditions for {func_name} pass ''' - + # Generate class tests class_tests = "" for class_name in classes: @@ -164,7 +174,7 @@ def test_{class_name.lower()}_methods(self): # TODO: Test class methods pass ''' - + # Add default test if no functions or classes found if not functions and not classes: default_test = ''' @@ -177,14 +187,14 @@ def test_module_imports(self): pass ''' class_tests += default_test - + # Combine all parts test_content = imports + function_tests + class_tests - + # Add main block - test_content += ''' + test_content += """ if __name__ == "__main__": unittest.main() -''' - - return test_content \ No newline at end of file +""" + + return test_content diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 141ecda2..628d5d07 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -3,9 +3,9 @@ """ import tempfile +from datetime import datetime from pathlib import Path from typing import Annotated, Dict, Optional -from datetime import datetime from zenml import step from zenml.logger import get_logger @@ -24,25 +24,29 @@ def report( ) -> Annotated[MarkdownString, "final_report"]: """ Generate comprehensive markdown report for pipeline execution. - + Args: workspace_dir: Workspace directory path commit_sha: Git commit SHA prompt_used: Prompt template used agent_results: Agent test results baseline_results: Baseline test results (optional) - + Returns: Path to generated markdown report """ logger.info("Generating pipeline execution report") - + # Create report file - report_file = Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" - + report_file = ( + Path(tempfile.mkdtemp(prefix="qualityflow_report_")) / "report.md" + ) + # Evaluate coverage metrics first - evaluation_metrics = _evaluate_coverage_metrics(agent_results, baseline_results, commit_sha) - + evaluation_metrics = _evaluate_coverage_metrics( + agent_results, baseline_results, commit_sha + ) + # Generate report content report_content = _generate_report_content( workspace_dir, @@ -52,13 +56,13 @@ def report( baseline_results, evaluation_metrics, ) - + # Write report file - with open(report_file, 'w') as f: + with open(report_file, "w") as f: f.write(report_content) - + logger.info(f"Report generated: {report_file}") - + # Return as MarkdownString for dashboard visualization return MarkdownString(report_content) @@ -69,27 +73,43 @@ def _evaluate_coverage_metrics( commit_sha: str, ) -> Dict: """Evaluate coverage metrics and compare agent vs baseline approaches.""" - + # Extract agent metrics - use actual values from test results coverage_total_agent = agent_results.get("coverage_total", 0.0) tests_passed_agent = agent_results.get("tests_passed", 0) tests_failed_agent = agent_results.get("tests_failed", 0) - + total_tests_agent = tests_passed_agent + tests_failed_agent - pass_rate_agent = tests_passed_agent / total_tests_agent if total_tests_agent > 0 else 0.0 - + pass_rate_agent = ( + tests_passed_agent / total_tests_agent + if total_tests_agent > 0 + else 0.0 + ) + # Extract baseline metrics coverage_total_baseline = 0.0 if baseline_results and not 
baseline_results.get("skipped", False): coverage_total_baseline = baseline_results.get("coverage_total", 0.0) - + # Compare agent vs baseline coverage coverage_improvement = coverage_total_agent - coverage_total_baseline - + # Analyze coverage quality - pass_rate_quality = "excellent" if pass_rate_agent > 0.95 else "good" if pass_rate_agent > 0.8 else "needs_improvement" - coverage_quality = "excellent" if coverage_total_agent > 80 else "good" if coverage_total_agent > 50 else "needs_improvement" - + pass_rate_quality = ( + "excellent" + if pass_rate_agent > 0.95 + else "good" + if pass_rate_agent > 0.8 + else "needs_improvement" + ) + coverage_quality = ( + "excellent" + if coverage_total_agent > 80 + else "good" + if coverage_total_agent > 50 + else "needs_improvement" + ) + return { "coverage_total_agent": coverage_total_agent, "coverage_total_baseline": coverage_total_baseline, @@ -113,9 +133,9 @@ def _generate_report_content( evaluation_metrics: Dict, ) -> str: """Generate markdown report content.""" - + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - + # Header report = f"""# QualityFlow Pipeline Report @@ -126,70 +146,74 @@ def _generate_report_content( ## Executive Summary """ - + # Executive summary coverage_agent = evaluation_metrics.get("coverage_total_agent", 0.0) coverage_baseline = evaluation_metrics.get("coverage_total_baseline", 0.0) improvement = evaluation_metrics.get("coverage_improvement", 0.0) quality = evaluation_metrics.get("coverage_quality", "unknown") - - quality_emoji = "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" - improvement_emoji = "๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" - + + quality_emoji = ( + "๐ŸŸข" if quality == "excellent" else "๐ŸŸก" if quality == "good" else "๐Ÿ”ด" + ) + improvement_emoji = ( + "๐Ÿ“ˆ" if improvement > 0 else "๐Ÿ“‰" if improvement < 0 else "โžก๏ธ" + ) + report += f"""{quality_emoji} **Coverage Quality**: {quality.upper()} {improvement_emoji} **Agent vs Baseline**: {coverage_agent:.2f}% vs {coverage_baseline:.2f}% ({improvement:+.2f}%) -๐Ÿงช **Tests**: {agent_results.get('tests_passed', 0)} passed, {agent_results.get('tests_failed', 0)} failed -๐Ÿ“ **Files**: {evaluation_metrics.get('files_analyzed', 0)} analyzed +๐Ÿงช **Tests**: {agent_results.get("tests_passed", 0)} passed, {agent_results.get("tests_failed", 0)} failed +๐Ÿ“ **Files**: {evaluation_metrics.get("files_analyzed", 0)} analyzed """ - + # Agent results section report += """## Agent Test Results """ - + if agent_results.get("skipped", False): report += "Agent tests were skipped.\n\n" else: - report += f"""- **Tests Passed**: {agent_results.get('tests_passed', 0)} -- **Tests Failed**: {agent_results.get('tests_failed', 0)} -- **Pass Rate**: {evaluation_metrics.get('pass_rate_agent', 0.0):.1%} -- **Coverage**: {agent_results.get('coverage_total', 0.0):.2f}% -- **JUnit Report**: `{agent_results.get('junit_path', 'N/A')}` -- **Coverage Report**: `{agent_results.get('coverage_path', 'N/A')}` -- **Logs**: `{agent_results.get('logs_path', 'N/A')}` + report += f"""- **Tests Passed**: {agent_results.get("tests_passed", 0)} +- **Tests Failed**: {agent_results.get("tests_failed", 0)} +- **Pass Rate**: {evaluation_metrics.get("pass_rate_agent", 0.0):.1%} +- **Coverage**: {agent_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{agent_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{agent_results.get("coverage_path", "N/A")}` +- **Logs**: `{agent_results.get("logs_path", "N/A")}` """ - + 
# Baseline results section (if available) if baseline_results and not baseline_results.get("skipped", False): report += """## Baseline Test Results """ - report += f"""- **Tests Passed**: {baseline_results.get('tests_passed', 0)} -- **Tests Failed**: {baseline_results.get('tests_failed', 0)} -- **Coverage**: {baseline_results.get('coverage_total', 0.0):.2f}% -- **JUnit Report**: `{baseline_results.get('junit_path', 'N/A')}` -- **Coverage Report**: `{baseline_results.get('coverage_path', 'N/A')}` + report += f"""- **Tests Passed**: {baseline_results.get("tests_passed", 0)} +- **Tests Failed**: {baseline_results.get("tests_failed", 0)} +- **Coverage**: {baseline_results.get("coverage_total", 0.0):.2f}% +- **JUnit Report**: `{baseline_results.get("junit_path", "N/A")}` +- **Coverage Report**: `{baseline_results.get("coverage_path", "N/A")}` """ - + # Evaluation metrics section report += """## Coverage Analysis """ - + pass_rate = evaluation_metrics.get("pass_rate_agent", 0.0) pass_quality = evaluation_metrics.get("pass_rate_quality", "unknown") - + report += f"""- **Agent Coverage**: {coverage_agent:.2f}% ({quality}) - **Baseline Coverage**: {coverage_baseline:.2f}% - **Improvement**: {improvement:+.2f}% - **Test Pass Rate**: {pass_rate:.1%} ({pass_quality}) -- **Files Analyzed**: {evaluation_metrics.get('files_analyzed', 0)} +- **Files Analyzed**: {evaluation_metrics.get("files_analyzed", 0)} """ - + # Recommendations section report += """## Recommendations @@ -200,14 +224,14 @@ def _generate_report_content( report += "๐Ÿ‘ **Good coverage.** Consider tweaking prompts or selection strategy for improvement.\n" else: report += "โš ๏ธ **Coverage needs improvement.** Try different prompts, models, or increase max_tests_per_file.\n" - + if improvement > 5: report += "๐Ÿ“ˆ **Agent significantly outperforms baseline** - LLM approach is working well.\n" elif improvement > 0: report += "๐Ÿ“Š **Agent slightly better than baseline** - room for optimization.\n" else: report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" - + # Configuration section report += """## Configuration @@ -220,7 +244,7 @@ def _generate_report_content( ### File Coverage Details """ - + coverage_by_file = agent_results.get("coverage_by_file", {}) if coverage_by_file: report += "| File | Coverage |\n|------|----------|\n" @@ -228,11 +252,11 @@ def _generate_report_content( report += f"| `{file_path}` | {coverage_pct:.1f}% |\n" else: report += "No file-level coverage data available.\n" - + report += """ --- *Generated by QualityFlow - Production-ready test generation with ZenML* """ - - return report \ No newline at end of file + + return report diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py index a4d12385..4c8697f3 100644 --- a/qualityflow/steps/run_tests.py +++ b/qualityflow/steps/run_tests.py @@ -2,9 +2,9 @@ Run tests and collect coverage metrics. """ +import shutil import subprocess import tempfile -import shutil from pathlib import Path from typing import Annotated, Dict, Optional @@ -21,12 +21,12 @@ def run_tests( label: str = "tests", ) -> Annotated[Dict, "test_results"]: """Run tests and collect coverage metrics. 
- + Args: workspace_dir: Path to workspace directory tests_dir: Path object to tests directory (None if no tests) label: Label for this test run - + Returns: Dictionary containing test results and metrics """ @@ -43,46 +43,54 @@ def run_tests( "logs_path": None, "skipped": True, } - + logger.info(f"Running {label} tests from {tests_dir}") - + # Create output directory for this test run output_dir = tempfile.mkdtemp(prefix=f"qualityflow_{label}_results_") output_path = Path(output_dir) - + junit_file = output_path / "junit.xml" coverage_file = output_path / "coverage.xml" logs_file = output_path / "test_logs.txt" - + # Copy tests to workspace (pytest needs them in PYTHONPATH) workspace_tests_dir = Path(workspace_dir) / f"tests_{label}" if workspace_tests_dir.exists(): shutil.rmtree(workspace_tests_dir) shutil.copytree(tests_dir, workspace_tests_dir) - + try: # Run pytest with coverage pytest_cmd = [ - "python", "-m", "pytest", + "python", + "-m", + "pytest", str(workspace_tests_dir), - "--junitxml", str(junit_file), - "--cov", str(workspace_dir), - "--cov-report", f"xml:{coverage_file}", - "--cov-report", "term", - "-v" + "--junitxml", + str(junit_file), + "--cov", + str(workspace_dir), + "--cov-report", + f"xml:{coverage_file}", + "--cov-report", + "term", + "-v", ] - + logger.info(f"Running command: {' '.join(pytest_cmd)}") logger.info(f"Working directory: {workspace_dir}") logger.info(f"Test directory: {workspace_tests_dir}") - + # Debug: list test files if workspace_tests_dir.exists(): test_files = list(workspace_tests_dir.glob("*.py")) logger.info(f"Test files found: {[f.name for f in test_files]}") else: - logger.warning(f"Test directory does not exist: {workspace_tests_dir}") - + logger.warning( + f"Test directory does not exist: {workspace_tests_dir}" + ) + result = subprocess.run( pytest_cmd, cwd=str(workspace_dir), @@ -90,32 +98,34 @@ def run_tests( text=True, timeout=300, # 5 minute timeout ) - + # Save logs and also log to console for debugging - with open(logs_file, 'w') as f: + with open(logs_file, "w") as f: f.write(f"Command: {' '.join(pytest_cmd)}\n") f.write(f"Return code: {result.returncode}\n\n") f.write("STDOUT:\n") f.write(result.stdout) - f.write("\nSTDERR:\n") + f.write("\nSTDERR:\n") f.write(result.stderr) - + # Also log the pytest output for debugging logger.info(f"Pytest return code: {result.returncode}") if result.stdout: logger.info(f"Pytest stdout: {result.stdout}") if result.stderr: logger.info(f"Pytest stderr: {result.stderr}") - + # Parse results test_results = _parse_test_results( result, junit_file, coverage_file, logs_file, label ) - - logger.info(f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage") - + + logger.info( + f"Test run complete for {label}: {test_results['tests_passed']} passed, {test_results['tests_failed']} failed, {test_results['coverage_total']:.2f}% coverage" + ) + return test_results - + except subprocess.TimeoutExpired: logger.error(f"Test run for {label} timed out after 5 minutes") return { @@ -125,11 +135,13 @@ def run_tests( "coverage_total": 0.0, "coverage_by_file": {}, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file), "error": "Test execution timed out", } - + except Exception as e: logger.error(f"Failed to run tests for {label}: 
{e}") return { @@ -139,11 +151,13 @@ def run_tests( "coverage_total": 0.0, "coverage_by_file": {}, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file) if logs_file.exists() else None, "error": str(e), } - + finally: # Clean up copied tests if workspace_tests_dir.exists(): @@ -153,41 +167,41 @@ def run_tests( def _parse_test_results( result: subprocess.CompletedProcess, junit_file: Path, - coverage_file: Path, + coverage_file: Path, logs_file: Path, label: str, ) -> Dict: """Parse test execution results.""" - + # Parse pytest output for basic stats tests_passed = 0 tests_failed = 0 - + if result.stdout: - lines = result.stdout.split('\n') + lines = result.stdout.split("\n") for line in lines: - if ' passed' in line and ' failed' in line: + if " passed" in line and " failed" in line: # Line like "2 failed, 3 passed in 1.23s" parts = line.split() for i, part in enumerate(parts): - if part == 'passed' and i > 0: - tests_passed = int(parts[i-1]) - elif part == 'failed' and i > 0: - tests_failed = int(parts[i-1]) - elif ' passed' in line and 'failed' not in line: + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + elif part == "failed" and i > 0: + tests_failed = int(parts[i - 1]) + elif " passed" in line and "failed" not in line: # Line like "5 passed in 1.23s" parts = line.split() for i, part in enumerate(parts): - if part == 'passed' and i > 0: - tests_passed = int(parts[i-1]) - + if part == "passed" and i > 0: + tests_passed = int(parts[i - 1]) + # Parse coverage from XML if available coverage_total = 0.0 coverage_by_file = {} - + if coverage_file.exists(): coverage_total, coverage_by_file = _parse_coverage_xml(coverage_file) - + return { "label": label, "tests_passed": tests_passed, @@ -195,7 +209,9 @@ def _parse_test_results( "coverage_total": coverage_total, "coverage_by_file": coverage_by_file, "junit_path": str(junit_file) if junit_file.exists() else None, - "coverage_path": str(coverage_file) if coverage_file.exists() else None, + "coverage_path": str(coverage_file) + if coverage_file.exists() + else None, "logs_path": str(logs_file), "return_code": result.returncode, } @@ -205,54 +221,60 @@ def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: """Parse coverage XML file.""" try: import xml.etree.ElementTree as ET - + tree = ET.parse(coverage_file) root = tree.getroot() - + # Debug: log the XML structure logger.info(f"Coverage XML root tag: {root.tag}") logger.info(f"Coverage XML root attribs: {root.attrib}") - + # Get overall coverage - try different formats coverage_total = 0.0 - + # Modern pytest-cov uses 'coverage' as root element - if root.tag == 'coverage': - line_rate = root.get('line-rate', '0') - if line_rate != '0': + if root.tag == "coverage": + line_rate = root.get("line-rate", "0") + if line_rate != "0": coverage_total = float(line_rate) * 100 logger.info(f"Found line-rate in coverage root: {line_rate}") else: # Try finding coverage element nested - coverage_element = root.find('.//coverage') + coverage_element = root.find(".//coverage") if coverage_element is not None: - line_rate = coverage_element.get('line-rate', '0') + line_rate = coverage_element.get("line-rate", "0") coverage_total = float(line_rate) * 100 - logger.info(f"Found coverage element with line-rate: {line_rate}") - + logger.info( + f"Found coverage element with line-rate: 
{line_rate}" + ) + # If still no coverage found, try branches-valid attribute (alternative format) if coverage_total == 0.0: - branches_valid = root.get('branches-valid', '0') - branches_covered = root.get('branches-covered', '0') - lines_valid = root.get('lines-valid', '0') - lines_covered = root.get('lines-covered', '0') - - if lines_valid != '0': + branches_valid = root.get("branches-valid", "0") + branches_covered = root.get("branches-covered", "0") + lines_valid = root.get("lines-valid", "0") + lines_covered = root.get("lines-covered", "0") + + if lines_valid != "0": line_coverage = float(lines_covered) / float(lines_valid) coverage_total = line_coverage * 100 - logger.info(f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%") - + logger.info( + f"Calculated coverage from lines: {lines_covered}/{lines_valid} = {coverage_total:.2f}%" + ) + # Get per-file coverage coverage_by_file = {} - for class_elem in root.findall('.//class'): - filename = class_elem.get('filename', '') - line_rate = class_elem.get('line-rate', '0') + for class_elem in root.findall(".//class"): + filename = class_elem.get("filename", "") + line_rate = class_elem.get("line-rate", "0") if filename: coverage_by_file[filename] = float(line_rate) * 100 - - logger.info(f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files") + + logger.info( + f"Parsed coverage: {coverage_total}% total, {len(coverage_by_file)} files" + ) return coverage_total, coverage_by_file - + except Exception as e: logger.warning(f"Failed to parse coverage XML: {e}") - return 0.0, {} \ No newline at end of file + return 0.0, {} diff --git a/qualityflow/steps/select_input.py b/qualityflow/steps/select_input.py index c274e8df..ff16e391 100644 --- a/qualityflow/steps/select_input.py +++ b/qualityflow/steps/select_input.py @@ -3,6 +3,7 @@ """ from typing import Annotated, Dict + from zenml import step from zenml.logger import get_logger @@ -17,22 +18,22 @@ def select_input( ) -> Annotated[Dict[str, str], "source_spec"]: """ Resolve source specification for test generation. 
- + Args: repo_url: Repository URL to analyze ref: Git reference (branch, tag, commit) target_glob: Glob pattern for target files - + Returns: Source specification dictionary """ logger.info(f"Selecting input source: {repo_url}@{ref}") - + spec = { "repo_url": repo_url, "ref": ref, "target_glob": target_glob, } - + logger.info(f"Source spec: {spec}") - return spec \ No newline at end of file + return spec From f04f49072838d75fae5eee08165460257e64dfc5 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 24 Aug 2025 22:33:07 +0200 Subject: [PATCH 3/8] Update loading prompt template and add log for missing file --- qualityflow/steps/gen_tests_agent.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 9a918afc..5026891f 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -62,19 +62,24 @@ def gen_tests_agent( tests_dir = tempfile.mkdtemp(prefix="qualityflow_agent_tests_") tests_path = Path(tests_dir) - # Load prompt template - workspace_path = Path(workspace_dir) - prompt_file = workspace_path / prompt_path + # Load prompt template from QualityFlow project directory + # Note: workspace_dir is the cloned repo, but prompts are in QualityFlow project + project_root = Path(__file__).parent.parent # Go up from steps/ to project root + prompt_file = project_root / prompt_path if prompt_file.exists(): with open(prompt_file, "r") as f: prompt_template = f.read() + logger.info(f"Loaded prompt template from {prompt_file}") else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() - logger.info(f"Using default prompt template, {prompt_path} not found") + logger.info(f"Using default prompt template, {prompt_path} not found at {prompt_file}") template = Template(prompt_template) + + # Keep workspace_path for reading source files + workspace_path = Path(workspace_dir) total_tokens_in = 0 total_tokens_out = 0 From da184b478bc17da8817e877e5eefbfda69cd956a Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 08:30:28 +0200 Subject: [PATCH 4/8] Remove unnecessary 'ast' requirement from file --- qualityflow/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qualityflow/requirements.txt b/qualityflow/requirements.txt index 72c59212..2d3f977e 100644 --- a/qualityflow/requirements.txt +++ b/qualityflow/requirements.txt @@ -12,7 +12,7 @@ pytest-cov>=4.0.0,<5.0.0 coverage>=7.0.0,<8.0.0 # Code Analysis -ast>=3.9 +# ast is built-in, no need to install # Git Integration gitpython>=3.1.0,<4.0.0 From 44c095455f075a231f88ab8bcce54291dff44add Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 11:23:16 +0200 Subject: [PATCH 5/8] Update test generation pipeline for QualityFlow.- Improved configurations and code organization --- qualityflow/README.md | 110 ++++++++---------------- qualityflow/run.py | 12 ++- qualityflow/steps/fetch_source.py | 3 + qualityflow/steps/gen_tests_agent.py | 10 ++- qualityflow/steps/gen_tests_baseline.py | 4 + qualityflow/steps/report.py | 4 + 6 files changed, 64 insertions(+), 79 deletions(-) diff --git a/qualityflow/README.md b/qualityflow/README.md index 490e9aa2..b7a76022 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -63,37 +63,39 @@ The main pipeline handles the complete test generation workflow: 
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ ``` -## ๐Ÿ“ฆ Quick Start +## ๐Ÿš€ Quick Start -### Prerequisites - -- Python 3.9+ -- ZenML installed (`pip install zenml`) -- Git -- OpenAI API key (optional, can use fake provider) - -### Setup +Get QualityFlow running in 3 simple steps: +### 1. Install Dependencies ```bash pip install -r requirements.txt ``` -2. **Set up OpenAI (optional)**: +### 2. Optional: Set up OpenAI API Key ```bash export OPENAI_API_KEY="your-api-key-here" ``` +*Skip this step to use the fake provider for testing* -3. **Run the pipeline**: +### 3. Run the Pipeline ```bash python run.py ``` -That's it! The pipeline will: -- Clone the configured repository (default: requests library) -- Analyze Python files and select candidates -- Generate tests using OpenAI (or fake provider if no API key) +**That's it!** The pipeline will automatically: +- Clone a sample repository (requests library by default) +- Analyze Python files and select test candidates +- Generate tests using LLM or fake provider - Run tests and measure coverage -- Generate a comprehensive report comparing approaches +- Create a detailed comparison report + +### What Happens Next? + +- Check the ZenML dashboard to see pipeline results +- View generated test files and coverage reports +- Compare LLM vs baseline test approaches +- Experiment with different configurations ## โš™๏ธ Configuration @@ -171,18 +173,17 @@ Requirements: ### A/B Testing Experiments -Use run templates for systematic comparisons: +Compare different configurations by running with different config files: ```bash # Compare prompt versions -python scripts/run_experiment.py --config configs/experiment.default.yaml -python scripts/run_experiment.py --config configs/experiment.strict.yaml +python run.py --config configs/experiment.default.yaml +python run.py --config configs/experiment.strict.yaml -# Compare in ZenML dashboard: +# Compare results in ZenML dashboard: # - Coverage metrics # - Test quality scores # - Token usage and cost -# - Promotion decisions ``` ### Production Deployment @@ -199,36 +200,23 @@ zenml stack register production_stack \ -a s3_store -c ecr_registry -o k8s_orchestrator --set ``` -### Scheduled Regression - -Register batch regression for daily execution: +### Scheduled Execution -```bash -python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule -``` +For automated runs, set up scheduled execution using your preferred orchestration tool or ZenML's scheduling features. 
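As a minimal sketch of the ZenML route (assuming the active orchestrator supports schedules; the cron expression and config path below are illustrative, not part of this repo):

```python
from zenml.config.schedule import Schedule

from pipelines import generate_and_evaluate

# Illustrative nightly schedule (02:00); requires an orchestrator with schedule support.
nightly = Schedule(cron_expression="0 2 * * *")

if __name__ == "__main__":
    # Attach the schedule and submit the pipeline with the default experiment config.
    generate_and_evaluate.with_options(
        config_path="configs/experiment.default.yaml",
        schedule=nightly,
    )()
```

Running this once should register the scheduled deployment with the active stack; drop the `schedule` argument to return to ad-hoc runs.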
## ๐Ÿ—๏ธ Project Structure ``` qualityflow/ โ”œโ”€โ”€ README.md -โ”œโ”€โ”€ pyproject.toml โ”œโ”€โ”€ requirements.txt -โ”œโ”€โ”€ .env.example -โ”œโ”€โ”€ zenml.yaml โ”‚ โ”œโ”€โ”€ configs/ # Pipeline configurations โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings -โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates -โ”‚ โ””โ”€โ”€ schedule.batch.yaml # Batch regression schedule -โ”‚ -โ”œโ”€โ”€ domain/ # Core data models -โ”‚ โ”œโ”€โ”€ schema.py # Pydantic models -โ”‚ โ””โ”€โ”€ stages.py # Deployment stages +โ”‚ โ””โ”€โ”€ experiment.strict.yaml # High-quality gates โ”‚ โ”œโ”€โ”€ pipelines/ # Pipeline definitions -โ”‚ โ”œโ”€โ”€ generate_and_evaluate.py # Experiment pipeline -โ”‚ โ””โ”€โ”€ batch_regression.py # Scheduled regression +โ”‚ โ””โ”€โ”€ generate_and_evaluate.py # Main pipeline โ”‚ โ”œโ”€โ”€ steps/ # Pipeline steps โ”‚ โ”œโ”€โ”€ select_input.py # Source specification @@ -237,43 +225,27 @@ qualityflow/ โ”‚ โ”œโ”€โ”€ gen_tests_agent.py # LLM test generation โ”‚ โ”œโ”€โ”€ gen_tests_baseline.py # Heuristic test generation โ”‚ โ”œโ”€โ”€ run_tests.py # Test execution & coverage -โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics & gate evaluation -โ”‚ โ”œโ”€โ”€ compare_and_promote.py # Model registry promotion -โ”‚ โ”œโ”€โ”€ resolve_test_pack.py # Test pack resolution +โ”‚ โ”œโ”€โ”€ evaluate_coverage.py # Metrics evaluation โ”‚ โ””โ”€โ”€ report.py # Report generation โ”‚ โ”œโ”€โ”€ prompts/ # Jinja2 prompt templates โ”‚ โ”œโ”€โ”€ unit_test_v1.jinja # Standard test generation โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation โ”‚ -โ”œโ”€โ”€ materializers/ # Custom artifact handling -โ”œโ”€โ”€ utils/ # Utility functions -โ”‚ -โ”œโ”€โ”€ registry/ # Test Pack registry docs -โ”‚ โ””โ”€โ”€ README.md -โ”‚ -โ”œโ”€โ”€ run_templates/ # Experiment templates -โ”‚ โ”œโ”€โ”€ ab_agent_vs_strict.json # A/B testing configuration -โ”‚ โ””โ”€โ”€ baseline_only.json # Baseline establishment -โ”‚ -โ”œโ”€โ”€ scripts/ # CLI scripts -โ”‚ โ”œโ”€โ”€ run_experiment.py # Experiment runner -โ”‚ โ””โ”€โ”€ run_batch.py # Batch regression runner +โ”œโ”€โ”€ examples/ # Demo code for testing +โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library +โ”‚ โ”œโ”€โ”€ calculator.py +โ”‚ โ””โ”€โ”€ string_utils.py โ”‚ -โ””โ”€โ”€ examples/ # Demo code for testing - โ””โ”€โ”€ toy_lib/ # Sample library - โ”œโ”€โ”€ calculator.py - โ””โ”€โ”€ string_utils.py +โ””โ”€โ”€ run.py # Main entry point ``` ### Key Components -- **Domain Models**: Pydantic schemas for type safety and validation - **Pipeline Steps**: Modular, reusable components with clear interfaces - **Prompt Templates**: Jinja2 templates for LLM test generation -- **Configuration**: YAML-driven experiment and deployment settings -- **Quality Gates**: Configurable thresholds for coverage and promotion -- **Model Registry**: ZenML Model Registry integration for test pack versioning +- **Configuration**: YAML-driven experiment settings +- **Test Generation**: Both LLM-based and heuristic approaches for comparison ## ๐Ÿš€ Production Deployment @@ -295,17 +267,7 @@ zenml stack register production \ ### Scheduled Execution -Set up automated regression testing: - -```bash -# Register schedule (example with ZenML Cloud) -python scripts/run_batch.py --config configs/schedule.batch.yaml --schedule - -# Monitor via dashboard: -# - Daily regression results -# - Coverage trend analysis -# - Test pack performance -``` +Set up automated regression testing using ZenML's scheduling capabilities or your preferred orchestration platform. 
## ๐Ÿค Contributing @@ -344,7 +306,7 @@ Run with debug logging: ```bash export ZENML_LOGGING_VERBOSITY=DEBUG -python scripts/run_experiment.py --config configs/experiment.default.yaml +python run.py --config configs/experiment.default.yaml ``` ## ๐Ÿ“š Resources diff --git a/qualityflow/run.py b/qualityflow/run.py index c9ff7370..40366aac 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -3,6 +3,7 @@ """ from pathlib import Path +from typing import Union import click from pipelines import generate_and_evaluate @@ -26,15 +27,20 @@ default=False, help="Disable pipeline caching and force fresh execution", ) -def main(config: str | None, no_cache: bool): +def main(config: Union[str, None], no_cache: bool): """Run QualityFlow test generation and coverage analysis pipeline. Simple pipeline that generates tests using LLM, runs them, measures coverage, and compares results against baseline approaches. """ - project_root = Path(__file__).parent - default_config = project_root / "configs" / "experiment.default.yaml" + try: + project_root = Path(__file__).resolve().parent + default_config = project_root / "configs" / "experiment.default.yaml" + except Exception: + # Fallback to current working directory + default_config = Path.cwd() / "configs" / "experiment.default.yaml" + chosen_config = config or str(default_config) try: diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index dfbfd609..82a88f5a 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -1,5 +1,8 @@ """ Fetch source code workspace step. + +This module provides functionality to clone Git repositories and prepare +workspaces for code analysis and test generation. """ import subprocess diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index 5026891f..b995490e 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -64,8 +64,14 @@ def gen_tests_agent( # Load prompt template from QualityFlow project directory # Note: workspace_dir is the cloned repo, but prompts are in QualityFlow project - project_root = Path(__file__).parent.parent # Go up from steps/ to project root - prompt_file = project_root / prompt_path + try: + # Try to resolve project root more robustly + current_file = Path(__file__).resolve() + project_root = current_file.parent.parent # Go up from steps/ to project root + prompt_file = project_root / prompt_path + except Exception: + # Fallback to current working directory if path resolution fails + prompt_file = Path.cwd() / prompt_path if prompt_file.exists(): with open(prompt_file, "r") as f: diff --git a/qualityflow/steps/gen_tests_baseline.py b/qualityflow/steps/gen_tests_baseline.py index a3197748..db712e81 100644 --- a/qualityflow/steps/gen_tests_baseline.py +++ b/qualityflow/steps/gen_tests_baseline.py @@ -1,5 +1,9 @@ """ Generate baseline/skeleton tests using heuristics. + +This module creates simple test templates by analyzing Python AST to identify +functions and classes, generating skeleton test code for comparison with +LLM-generated tests. """ import ast diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 628d5d07..3d7fcdc5 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -1,5 +1,9 @@ """ Generate comprehensive pipeline report. 
+ +This module creates detailed markdown reports comparing LLM-generated tests +against baseline tests, including coverage metrics, quality assessments, +and recommendations for improvement. """ import tempfile From 7294da94a8c6e524f64877d1267260c4eba8461d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:16:44 +0200 Subject: [PATCH 6/8] Add local examples testing option --- qualityflow/README.md | 33 +++- qualityflow/configs/experiment.local.yaml | 40 +++++ .../pipelines/generate_and_evaluate.py | 8 +- qualityflow/run.py | 2 +- qualityflow/steps/analyze_code.py | 11 +- qualityflow/steps/fetch_source.py | 48 +++++- qualityflow/steps/gen_tests_agent.py | 156 ++++++++++++++++-- qualityflow/steps/report.py | 21 +-- qualityflow/steps/run_tests.py | 2 - 9 files changed, 271 insertions(+), 50 deletions(-) create mode 100644 qualityflow/configs/experiment.local.yaml diff --git a/qualityflow/README.md b/qualityflow/README.md index b7a76022..8b183404 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -97,6 +97,16 @@ python run.py - Compare LLM vs baseline test approaches - Experiment with different configurations +### Local Testing Option + +For offline development or controlled testing, use the local examples: + +```bash +python run.py --config configs/experiment.local.yaml +``` + +This uses the included `examples/toy_lib/` code instead of cloning external repositories. + ## โš™๏ธ Configuration ### Key Parameters @@ -125,14 +135,17 @@ steps: ### Pipeline Options ```bash -# Use fake provider (no API key needed) -python run.py # Uses config defaults +# Default: uses remote repository (requests library) +python run.py -# Force fresh execution (no caching) -python run.py --no-cache +# Local testing with included examples +python run.py --config configs/experiment.local.yaml -# Use different config +# High-quality test generation python run.py --config configs/experiment.strict.yaml + +# Force fresh execution (no caching) +python run.py --no-cache ``` ## ๐Ÿ”ฌ Advanced Usage @@ -213,7 +226,8 @@ qualityflow/ โ”‚ โ”œโ”€โ”€ configs/ # Pipeline configurations โ”‚ โ”œโ”€โ”€ experiment.default.yaml # Standard experiment settings -โ”‚ โ””โ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ”œโ”€โ”€ experiment.strict.yaml # High-quality gates +โ”‚ โ””โ”€โ”€ experiment.local.yaml # Local examples testing โ”‚ โ”œโ”€โ”€ pipelines/ # Pipeline definitions โ”‚ โ””โ”€โ”€ generate_and_evaluate.py # Main pipeline @@ -233,9 +247,9 @@ qualityflow/ โ”‚ โ””โ”€โ”€ unit_test_strict_v2.jinja # Comprehensive test generation โ”‚ โ”œโ”€โ”€ examples/ # Demo code for testing -โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library -โ”‚ โ”œโ”€โ”€ calculator.py -โ”‚ โ””โ”€โ”€ string_utils.py +โ”‚ โ””โ”€โ”€ toy_lib/ # Sample library with test-friendly code +โ”‚ โ”œโ”€โ”€ calculator.py # Calculator class with edge cases +โ”‚ โ””โ”€โ”€ string_utils.py # String utilities with validation โ”‚ โ””โ”€โ”€ run.py # Main entry point ``` @@ -246,6 +260,7 @@ qualityflow/ - **Prompt Templates**: Jinja2 templates for LLM test generation - **Configuration**: YAML-driven experiment settings - **Test Generation**: Both LLM-based and heuristic approaches for comparison +- **Example Code**: Sample Python modules (`toy_lib`) designed for effective test generation demonstration ## ๐Ÿš€ Production Deployment diff --git a/qualityflow/configs/experiment.local.yaml b/qualityflow/configs/experiment.local.yaml new file mode 100644 index 00000000..477f089e --- /dev/null +++ b/qualityflow/configs/experiment.local.yaml @@ -0,0 +1,40 @@ +# 
QualityFlow Local Examples Configuration +# Use local toy_lib examples instead of remote repositories + +# Pipeline configuration +name: "generate_and_evaluate" +version: "1.0" + +# Source configuration - using local examples +steps: + select_input: + parameters: + # Use local examples instead of remote repo + repo_url: "local" + ref: "main" + target_glob: "examples/**/*.py" # Target the toy_lib examples + + analyze_code: + parameters: + strategy: "all" # Include all example files + max_files: 5 # Process all toy_lib files + + # LLM generation configuration + gen_tests_agent: + parameters: + provider: "fake" # Use fake provider by default for local testing + model: "gpt-4o-mini" + prompt_path: "prompts/unit_test_v1.jinja" + max_tests_per_file: 3 + max_files: 5 # Process all toy_lib files + + # Baseline test generation + gen_tests_baseline: + parameters: + enabled: true + max_files: 5 # Match agent max_files + +# Resource configuration +settings: + docker: + requirements: requirements.txt \ No newline at end of file diff --git a/qualityflow/pipelines/generate_and_evaluate.py b/qualityflow/pipelines/generate_and_evaluate.py index e359afb8..7050b5bd 100644 --- a/qualityflow/pipelines/generate_and_evaluate.py +++ b/qualityflow/pipelines/generate_and_evaluate.py @@ -32,10 +32,12 @@ def generate_and_evaluate() -> None: workspace_dir, commit_sha = fetch_source(spec) # Step 3: Analyze and select code files - code_summary = analyze_code(workspace_dir, commit_sha) + code_summary = analyze_code(workspace_dir, commit_sha, spec) # Step 4: Generate tests using LLM agent - agent_tests_dir, prompt_used = gen_tests_agent(workspace_dir, code_summary) + agent_tests_dir, test_summary = gen_tests_agent( + workspace_dir, code_summary + ) # Step 5: Generate baseline tests (optional) baseline_tests_dir = gen_tests_baseline(workspace_dir, code_summary) @@ -52,7 +54,7 @@ def generate_and_evaluate() -> None: report( workspace_dir, commit_sha, - prompt_used, + test_summary, agent_results, baseline_results, ) diff --git a/qualityflow/run.py b/qualityflow/run.py index 40366aac..b4a9c513 100644 --- a/qualityflow/run.py +++ b/qualityflow/run.py @@ -40,7 +40,7 @@ def main(config: Union[str, None], no_cache: bool): except Exception: # Fallback to current working directory default_config = Path.cwd() / "configs" / "experiment.default.yaml" - + chosen_config = config or str(default_config) try: diff --git a/qualityflow/steps/analyze_code.py b/qualityflow/steps/analyze_code.py index 3a8f5a14..7cc5822c 100644 --- a/qualityflow/steps/analyze_code.py +++ b/qualityflow/steps/analyze_code.py @@ -28,7 +28,7 @@ class SelectionStrategy(str, Enum): def analyze_code( workspace_dir: Path, commit_sha: str, - target_glob: str = "src/**/*.py", + source_spec: Dict[str, str], strategy: SelectionStrategy = SelectionStrategy.LOW_COVERAGE, max_files: int = 10, ) -> Annotated[Dict, "code_summary"]: @@ -38,14 +38,19 @@ def analyze_code( Args: workspace_dir: Path to workspace directory commit_sha: Git commit SHA - target_glob: Glob pattern for target files + source_spec: Source specification containing target_glob and other settings strategy: File selection strategy max_files: Maximum number of files to select Returns: Code summary dictionary containing selected files and metadata """ - logger.info(f"Analyzing code in {workspace_dir} with strategy {strategy}") + # Extract target_glob from source spec + target_glob = source_spec.get("target_glob", "src/**/*.py") + + logger.info( + f"Analyzing code in {workspace_dir} with strategy {strategy} and 
glob {target_glob}" + ) workspace_path = Path(workspace_dir) diff --git a/qualityflow/steps/fetch_source.py b/qualityflow/steps/fetch_source.py index 82a88f5a..cdf37548 100644 --- a/qualityflow/steps/fetch_source.py +++ b/qualityflow/steps/fetch_source.py @@ -21,7 +21,7 @@ def fetch_source( source_spec: Dict[str, str], ) -> Tuple[Annotated[Path, "workspace_dir"], Annotated[str, "commit_sha"]]: """ - Fetch and materialize workspace from git repository. + Fetch and materialize workspace from git repository or use local examples. Args: source_spec: Source specification from select_input step @@ -32,9 +32,53 @@ def fetch_source( repo_url = source_spec["repo_url"] ref = source_spec["ref"] + # Handle local examples case + if repo_url == "local": + logger.info("Using local QualityFlow examples") + try: + # Get the project root (QualityFlow directory) + current_file = Path(__file__).resolve() + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root + + # Create temporary workspace and copy examples + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_local_workspace_" + ) + workspace_path = Path(workspace_dir) + + # Copy examples directory to the temporary workspace + import shutil + + examples_src = project_root / "examples" + examples_dest = workspace_path / "examples" + + if examples_src.exists(): + shutil.copytree(examples_src, examples_dest) + logger.info( + f"Copied examples from {examples_src} to {examples_dest}" + ) + else: + logger.warning( + f"Examples directory not found at {examples_src}" + ) + + commit_sha = "local-examples" + logger.info(f"Local workspace ready at {workspace_path}") + return workspace_path, commit_sha + + except Exception as e: + logger.error(f"Failed to set up local workspace: {e}") + # Fallback to current working directory + workspace_dir = tempfile.mkdtemp( + prefix="qualityflow_fallback_workspace_" + ) + return Path(workspace_dir), "local-fallback" + logger.info(f"Fetching source from {repo_url}@{ref}") - # Create temporary workspace + # Create temporary workspace for remote repositories workspace_dir = tempfile.mkdtemp(prefix="qualityflow_workspace_") workspace_path = Path(workspace_dir) diff --git a/qualityflow/steps/gen_tests_agent.py b/qualityflow/steps/gen_tests_agent.py index b995490e..de879731 100644 --- a/qualityflow/steps/gen_tests_agent.py +++ b/qualityflow/steps/gen_tests_agent.py @@ -10,6 +10,7 @@ from jinja2 import Template from zenml import log_metadata, step from zenml.logger import get_logger +from zenml.types import MarkdownString class GenerationProvider(str, Enum): @@ -32,7 +33,10 @@ def gen_tests_agent( prompt_path: str = "prompts/unit_test_v1.jinja", max_tests_per_file: int = 3, max_files: int = 10, -) -> Tuple[Annotated[Path, "agent_tests_dir"], Annotated[str, "prompt_used"]]: +) -> Tuple[ + Annotated[Path, "agent_tests_dir"], + Annotated[MarkdownString, "test_summary"], +]: """Generate tests using LLM agent. 
Args: @@ -45,7 +49,7 @@ def gen_tests_agent( max_files: Maximum number of files to process (for speed control) Returns: - Tuple of test directory and prompt used + Tuple of test directory and test generation summary """ # Extract selected files from code summary selected_files = code_summary.get("selected_files", []) @@ -67,7 +71,9 @@ def gen_tests_agent( try: # Try to resolve project root more robustly current_file = Path(__file__).resolve() - project_root = current_file.parent.parent # Go up from steps/ to project root + project_root = ( + current_file.parent.parent + ) # Go up from steps/ to project root prompt_file = project_root / prompt_path except Exception: # Fallback to current working directory if path resolution fails @@ -80,16 +86,19 @@ def gen_tests_agent( else: # Use default template if file doesn't exist prompt_template = _get_default_prompt_template() - logger.info(f"Using default prompt template, {prompt_path} not found at {prompt_file}") + logger.info( + f"Using default prompt template, {prompt_path} not found at {prompt_file}" + ) template = Template(prompt_template) - + # Keep workspace_path for reading source files workspace_path = Path(workspace_dir) total_tokens_in = 0 total_tokens_out = 0 - materialized_prompts = {} # Store materialized prompts per file + test_snippets = {} # Store test snippets per file + test_stats = {} # Store test statistics per file for file_path in files_to_process: logger.info(f"Generating tests for {file_path}") @@ -109,8 +118,15 @@ def gen_tests_agent( ), ) - # Store the materialized prompt for this file - materialized_prompts[file_path] = materialized_prompt + # Store test generation info for this file + test_stats[file_path] = { + "provider": provider.value, + "model": model, + "max_tests": max_tests_per_file, + "complexity_score": code_summary.get("complexity_scores", {}).get( + file_path, 0 + ), + } # Generate tests using provider if provider == GenerationProvider.FAKE: @@ -138,9 +154,26 @@ def gen_tests_agent( with open(test_file_path, "w") as f: f.write(generated_tests) + # Store test snippet for summary (first 20 lines) + test_lines = generated_tests.split("\n") + snippet_lines = test_lines[:20] + if len(test_lines) > 20: + snippet_lines.append("... (truncated)") + test_snippets[file_path] = "\n".join(snippet_lines) + + # Update test stats with actual counts + test_stats[file_path]["lines_generated"] = len(test_lines) + test_stats[file_path]["test_functions"] = len( + [ + line + for line in test_lines + if line.strip().startswith("def test_") + ] + ) + logger.info(f"Generated tests saved to {test_file_path}") - # Log comprehensive metadata including materialized prompts + # Log comprehensive metadata metadata = { "token_usage": { "tokens_in": total_tokens_in, @@ -156,8 +189,7 @@ def gen_tests_agent( "max_tests_per_file": max_tests_per_file, "files_processed": len(files_to_process), }, - "materialized_prompts": materialized_prompts, - "prompt_template": prompt_template, + "test_stats": test_stats, } log_metadata(metadata) @@ -165,11 +197,104 @@ def gen_tests_agent( f"Test generation complete. 
Files: {len(files_to_process)}, Tokens: {total_tokens_in} in / {total_tokens_out} out" ) - # Create a better prompt summary for the report - prompt_summary = f"Template: {prompt_path}\nProvider: {provider.value}\nModel: {model}\nFiles processed: {len(files_to_process)}" + # Create test generation summary + test_summary = _create_test_summary( + provider, + model, + prompt_path, + files_to_process, + test_snippets, + test_stats, + total_tokens_in, + total_tokens_out, + ) # Return Path object - ZenML will automatically materialize the folder - return Path(tests_dir), prompt_summary + return Path(tests_dir), test_summary + + +def _create_test_summary( + provider: GenerationProvider, + model: str, + prompt_path: str, + files_processed: list, + test_snippets: Dict[str, str], + test_stats: Dict[str, Dict], + total_tokens_in: int, + total_tokens_out: int, +) -> MarkdownString: + """Create a markdown summary of test generation results.""" + + # Calculate totals + total_lines = sum( + stats.get("lines_generated", 0) for stats in test_stats.values() + ) + total_test_functions = sum( + stats.get("test_functions", 0) for stats in test_stats.values() + ) + + # Handle edge case of no files processed + if len(files_processed) == 0: + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: 0 + +## Generation Statistics +โš ๏ธ **No files were processed for test generation.** + +This could happen if: +- No files matched the target glob pattern +- All files were filtered out during analysis +- Max files limit was set to 0 + +**Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out +""" + return MarkdownString(summary) + + # Build markdown content for successful processing + avg_tests = total_test_functions / len(files_processed) + summary = f"""# ๐Ÿงช Test Generation Summary + +## Configuration +- **Provider**: {provider.value} +- **Model**: {model} +- **Prompt Template**: {prompt_path} +- **Files Processed**: {len(files_processed)} + +## Generation Statistics +- **Total Lines Generated**: {total_lines:,} +- **Total Test Functions**: {total_test_functions} +- **Average Tests per File**: {avg_tests:.1f} +- **Token Usage**: {total_tokens_in:,} in / {total_tokens_out:,} out + +## Generated Tests by File + +""" + + for file_path in files_processed: + stats = test_stats.get(file_path, {}) + snippet = test_snippets.get(file_path, "") + + complexity = stats.get("complexity_score", 0) + lines = stats.get("lines_generated", 0) + test_count = stats.get("test_functions", 0) + + summary += f"""### ๐Ÿ“„ `{file_path}` +**Complexity Score**: {complexity:.1f} | **Lines**: {lines} | **Test Functions**: {test_count} + +``` +{snippet} +``` + +--- + +""" + + return MarkdownString(summary) def _get_default_prompt_template() -> str: @@ -197,9 +322,6 @@ def _generate_fake_tests( file_path: str, source_code: str, max_tests: int ) -> Tuple[str, Dict]: """Generate fake/mock tests for development/testing.""" - # Create a simple module name from file path - module_name = file_path.replace("/", ".").replace(".py", "") - test_content = f'''""" Generated tests for {file_path} """ diff --git a/qualityflow/steps/report.py b/qualityflow/steps/report.py index 3d7fcdc5..ab8e564d 100644 --- a/qualityflow/steps/report.py +++ b/qualityflow/steps/report.py @@ -22,7 +22,7 @@ def report( workspace_dir: Path, commit_sha: str, - prompt_used: str, + test_summary: MarkdownString, agent_results: Dict, baseline_results: 
Optional[Dict], ) -> Annotated[MarkdownString, "final_report"]: @@ -32,12 +32,12 @@ def report( Args: workspace_dir: Workspace directory path commit_sha: Git commit SHA - prompt_used: Prompt template used + test_summary: Test generation summary with snippets agent_results: Agent test results baseline_results: Baseline test results (optional) Returns: - Path to generated markdown report + Markdown report as string """ logger.info("Generating pipeline execution report") @@ -55,7 +55,7 @@ def report( report_content = _generate_report_content( workspace_dir, commit_sha, - prompt_used, + test_summary, agent_results, baseline_results, evaluation_metrics, @@ -131,7 +131,7 @@ def _evaluate_coverage_metrics( def _generate_report_content( workspace_dir: Path, commit_sha: str, - prompt_used: str, + test_summary: MarkdownString, agent_results: Dict, baseline_results: Optional[Dict], evaluation_metrics: Dict, @@ -236,15 +236,10 @@ def _generate_report_content( else: report += "๐Ÿ“‰ **Baseline performs as well or better** - review agent configuration.\n" - # Configuration section - report += """## Configuration + # Test generation details section + report += f"""## Test Generation Details -### Prompt Template -``` -""" - report += prompt_used[:500] + ("..." if len(prompt_used) > 500 else "") - report += """ -``` +{test_summary} ### File Coverage Details """ diff --git a/qualityflow/steps/run_tests.py b/qualityflow/steps/run_tests.py index 4c8697f3..4ad3edc9 100644 --- a/qualityflow/steps/run_tests.py +++ b/qualityflow/steps/run_tests.py @@ -250,8 +250,6 @@ def _parse_coverage_xml(coverage_file: Path) -> tuple[float, Dict[str, float]]: # If still no coverage found, try branches-valid attribute (alternative format) if coverage_total == 0.0: - branches_valid = root.get("branches-valid", "0") - branches_covered = root.get("branches-covered", "0") lines_valid = root.get("lines-valid", "0") lines_covered = root.get("lines-covered", "0") From 5db0a78d8ce099c9c46308be880140673c1825bf Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:26:01 +0200 Subject: [PATCH 7/8] Update project Dockerfile link and add QualityFlow project --- ADDING_PROJECTS.md | 2 +- README.md | 1 + llm-complete-guide/README.md | 4 ++-- qualityflow/README.md | 3 --- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ADDING_PROJECTS.md b/ADDING_PROJECTS.md index a169560d..f1c81b9d 100644 --- a/ADDING_PROJECTS.md +++ b/ADDING_PROJECTS.md @@ -57,7 +57,7 @@ ENV ZENML_ENABLE_TUTORIAL=true ### When No Dockerfile is Needed If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at: -[https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile](https://github.com/zenml-io/zenml-projects-backend/blob/main/.docker/project.Dockerfile) +[https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA](https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA) ## ๐Ÿ”ง Backend Integration diff --git a/README.md b/README.md index 27b762ba..46accab2 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ etc. 
| [Nightwatch AI](nightwatch-ai) | ๐Ÿค– LLMOps | ๐Ÿ“ Summarization, ๐Ÿ“Š Reporting | openai, supabase, slack | | [ResearchRadar](research-radar) | ๐Ÿค– LLMOps | ๐Ÿ“ Classification, ๐Ÿ“Š Comparison | anthropic, huggingface, transformers | | [Deep Research](deep_research) | ๐Ÿค– LLMOps | ๐Ÿ“ Research, ๐Ÿ“Š Reporting, ๐Ÿ” Web Search | anthropic, mcp, agents, openai | +| [QualityFlow](qualityflow) | ๐Ÿค– LLMOps | ๐Ÿงช Test Generation, ๐Ÿ“Š Coverage Analysis, โšก Automation | openai, anthropic, pytest, jinja2 | | [End-to-end Computer Vision](end-to-end-computer-vision) | ๐Ÿ‘ CV | ๐Ÿ”Ž Object Detection, ๐Ÿท๏ธ Labeling | pytorch, label_studio, yolov8 | | [Magic Photobooth](magic-photobooth) | ๐Ÿ‘ CV | ๐Ÿ“ท Image Gen, ๐ŸŽž๏ธ Video Gen | stable-diffusion, huggingface | | [OmniReader](omni-reader) | ๐Ÿ‘ CV | ๐Ÿ“‘ OCR, ๐Ÿ“Š Evaluation, โš™๏ธ Batch Processing | polars, litellm, openai, ollama | diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index f352d2bf..7fd23bad 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -235,7 +235,7 @@ python run.py synthetic You will also need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) @@ -254,7 +254,7 @@ zenml secret update llm-complete -v '{"argilla_api_key": "YOUR_ARGILLA_API_KEY", As with the previous pipeline, you will need to have set up and connected to an Argilla instance for this to work. Please follow the instructions in the [Argilla -documentation](https://docs.argilla.io/latest/getting_started/quickstart/) +documentation](https://docs.v1.argilla.io/en/latest/) to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's Argilla integration documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla) diff --git a/qualityflow/README.md b/qualityflow/README.md index 8b183404..dd69e58e 100644 --- a/qualityflow/README.md +++ b/qualityflow/README.md @@ -289,7 +289,6 @@ Set up automated regression testing using ZenML's scheduling capabilities or you QualityFlow follows ZenML best practices and is designed to be extended: 1. **Add New LLM Providers**: Extend `gen_tests_agent.py` with new provider integrations -2. **Custom Materializers**: Create materializers for new artifact types 3. **Additional Metrics**: Expand evaluation capabilities with new quality metrics 4. 
**Selection Strategies**: Add new code selection algorithms @@ -327,8 +326,6 @@ python run.py --config configs/experiment.default.yaml ## ๐Ÿ“š Resources - [ZenML Documentation](https://docs.zenml.io/) -- [Model Control Plane](https://docs.zenml.io/user-guide/model-control-plane) -- [Kubernetes Orchestrator](https://docs.zenml.io/stacks/stack-components/orchestrators/kubernetes) --- From 80fbe1273b869497102b1ea4424fe2eb6f21fcfb Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 25 Aug 2025 12:28:42 +0200 Subject: [PATCH 8/8] Update link in ADDING_PROJECTS.md to zenml-projects-backend --- ADDING_PROJECTS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ADDING_PROJECTS.md b/ADDING_PROJECTS.md index f1c81b9d..35f26d3b 100644 --- a/ADDING_PROJECTS.md +++ b/ADDING_PROJECTS.md @@ -56,8 +56,7 @@ ENV ZENML_ENABLE_TUTORIAL=true ### When No Dockerfile is Needed -If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at: -[https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA](https://raw.githubusercontent.com/zenml-io/zenml-projects-backend/refs/heads/main/.docker/project.Dockerfile?token=GHSAT0AAAAAADISFM36XGBCROFV7ZUEFSUK2FMHITA) +If your project only requires Python dependencies listed in `requirements.txt`, **do not include a Dockerfile**. The projects backend will automatically build your project using the generic Dockerfile available at the zenml-projects-backend repo. ## ๐Ÿ”ง Backend Integration