Merged — changes from all commits
All 27 commits were authored by paulz on Mar 11, 2025:

c749b1f  fix: https://github.com/thisisartium/continuous-alignment-testing/iss…
47bc9f8  fix: ModuleNotFoundError: No module named 'settings'
403cc20  add src folder to pytest path
2b44e70  fix: update path for test runs in cat-test-examples.yml
43917e6  fix: improve description and output message for rounds input in cat-t…
b294288  fix: AI tests appear hanging
ecb8490  fix: update folder references in cat-test-examples.yml for AI tests
47c4fa5  fix: rename folder variable for readability
82343f4  fix: Input required and not supplied: path
958ed92  refactor: generate_choices function to remove client parameter and im…
5f5cea8  refactor: report margin of error for readability
7eb3f63  fix: type warning
5ed982b  fix: mypy
e1e3bd9  fix: input rounds should override defaults
03e2362  fix: simplify run count logic in CI configuration
7fbe679  fix: add debug output for GitHub ref name in CI configuration
151288b  fix: bash
65f81d2  fix: bash with Claude
84f139b  fix: bash with Claude
f3fe388  fix: bash with Claude
4e29c3e  fix: CI svg snapshot
d4c7b32  fix: CI generates different SVG
7113e81  fix: CI generates different PNG
194f3c8  debug: unit tests
65813a8  fix: CI hangs on unit tests
29f7569  fix: CI hangs on unit tests
dde706e  skip creation date metadata
48 changes: 29 additions & 19 deletions .github/workflows/cat-test-examples.yml
@@ -6,7 +6,7 @@ on:
workflow_dispatch:
inputs:
rounds:
description: "Number of Rounds"
description: "Number of Rounds 1 - 128"
type: number
required: true
default: 10
@@ -15,6 +15,8 @@ jobs:
ai_tests:
name: AI Tests
runs-on: ubuntu-latest
env:
TEST_RESULTS_FOLDER: examples/team_recommender/test_runs

steps:
- uses: actions/checkout@v4
@@ -41,21 +43,30 @@ jobs:
- name: Set number of runs
id: set-number-of-runs
run: |
ROUNDS=${{ inputs.rounds || 10 }}
[[ $GITHUB_REF_NAME == ci-experiment* ]] && ROUNDS=1
[[ "${GITHUB_REF_NAME}" =~ ^ci-experiment/ ]] && ROUNDS=1 || ROUNDS=10
ROUNDS=${INPUT_ROUNDS:-$ROUNDS}

if [ "$ROUNDS" -gt 128 ] || [ "$ROUNDS" -le 0 ]
then
echo "Invalid number of rounds: $ROUNDS"
exit 1
fi

echo "::notice::Starting $ROUNDS runs"
echo "::notice::Starting ${ROUNDS} run$([ "$ROUNDS" -eq 1 ] || echo "s")"
echo "number_of_runs=$ROUNDS" >> "$GITHUB_OUTPUT"
echo "CAT_AI_SAMPLE_SIZE=$ROUNDS" >> $GITHUB_ENV

- name: Run Example tests
run: uv run pytest examples/team_recommender/tests/example_7_*
run: >
uv run pytest
--verbose --verbosity=10 --capture=no --tb=native --color=yes --showlocals
examples/team_recommender/tests/example_7_*
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

# - name: Upload artifacts to MinIO
# run: |
# zip -r test-output-${{ github.run_number }}.zip examples/team_recommender/tests/test_runs
# zip -r test-output-${{ github.run_number }}.zip "examples/team_recommender/test_runs"
# curl -X PUT -T "/path/to/yourfile.zip" \
# -H "Host: localhost:9000" \
# -H "Date: $(date -R)" \
@@ -65,32 +76,31 @@ jobs:

- name: Show CAT AI Statistical Report
if: always()
run: |
FOLDER=examples/team_recommender/tests/test_runs
FAILURE_COUNT=$(find "$FOLDER" -type f -name "fail-*" | wc -l)
run: |
FAILURE_COUNT=$(find "$TEST_RESULTS_FOLDER" -type f -name "fail-*.json" | wc -l)
PYTHONPATH=src uv run python -m cat_ai.reporter \
"$FAILURE_COUNT" \
"$CAT_AI_SAMPLE_SIZE" \
>> "$GITHUB_STEP_SUMMARY"

- name: Upload main artifacts to Google Drive
if: always() && github.ref == 'refs/heads/main'
if: always() && github.ref_name == 'main'
run: |
zip -r "$FILENAME" examples/team_recommender/tests/test_runs
uv run python src/cat_ai/publish_to_gdrive.py "$FILENAME"
zip -r "$ZIP_WITH_RUN" "$TEST_RESULTS_FOLDER"
uv run python src/cat_ai/publish_to_gdrive.py "$ZIP_WITH_RUN"
env:
PARENT_FOLDER_IDS: ${{ vars.GOOGLE_DRIVE_TEST_OUTPUT_FOLDER_ID }}
FILENAME: test-output-${{ github.run_number }}.zip
ZIP_WITH_RUN: test-output-${{ github.run_number }}.zip

- name: Upload artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: test-output-${{ github.run_number }}
path: examples/team_recommender/tests/test_runs
path: ${{ env.TEST_RESULTS_FOLDER }}

# - name: Debugging with tmate
# if: failure()
# uses: lhotari/action-upterm@v1
# with:
# wait-timeout-minutes: 5
- name: Debugging with tmate
if: failure()
uses: lhotari/action-upterm@v1
with:
wait-timeout-minutes: 5
8 changes: 6 additions & 2 deletions .github/workflows/python-tests.yml
@@ -30,7 +30,11 @@ jobs:
run: uv sync --all-extras --dev

- name: Run unit tests
run: uv run pytest

run: >
uv run pytest
--timeout=10
--color=yes
--verbose --verbosity=10 --capture=no --tb=native --showlocals

- name: Type check Python code
run: uv run mypy src
6 changes: 6 additions & 0 deletions examples/team_recommender/conftest.py
@@ -1,4 +1,10 @@
from dotenv import load_dotenv
import sys
from pathlib import Path

# Load environment variables from .env file
load_dotenv()

source_folder = str((Path(__file__).parent / "src").resolve())
print("source_folder", source_folder)
sys.path.append(source_folder)
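With the example's src directory appended to sys.path by this conftest, the example tests can import the new helper modules directly. A minimal sketch of what that enables, assuming it runs under this conftest:

```python
# These imports only resolve because conftest.py appended
# examples/team_recommender/src to sys.path.
from settings import ROOT_DIR, root_path
from retry import retry
```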
60 changes: 60 additions & 0 deletions examples/team_recommender/src/retry.py
@@ -0,0 +1,60 @@
import time
import logging
from functools import wraps
from typing import Any, Callable, TypeVar, Optional, Tuple, Type, Union, Dict, List

T = TypeVar('T')
logger = logging.getLogger(__name__)

def retry(
max_attempts: int = 3,
exceptions: Tuple[Type[Exception], ...] = (Exception,),
initial_delay: float = 1.0,
backoff_factor: float = 2.0,
logger_name: Optional[str] = None
) -> Callable:
"""
Retry decorator with exponential backoff for handling transient errors.

Args:
max_attempts: Maximum number of attempts (including first try)
exceptions: Tuple of exception types to catch and retry
initial_delay: Initial delay between retries in seconds
backoff_factor: Multiplier for delay after each retry
logger_name: Optional logger name for custom logging

Returns:
Decorated function with retry logic
"""
local_logger = logger
if logger_name:
local_logger = logging.getLogger(logger_name)

def decorator(func: Callable[..., T]) -> Callable[..., T]:
@wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> T:
attempt = 1
current_delay = initial_delay

while True:
try:
return func(*args, **kwargs)
except exceptions as e:
if attempt >= max_attempts:
local_logger.error(
f"Failed after {max_attempts} attempts: {e.__class__.__name__}: {str(e)}"
)
raise

local_logger.warning(
f"Attempt {attempt}/{max_attempts} failed with {e.__class__.__name__}: {str(e)}. "
f"Retrying in {current_delay:.2f}s..."
)

time.sleep(current_delay)
current_delay *= backoff_factor
attempt += 1

return wrapper

return decorator
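A minimal usage sketch for this decorator; the flaky function below is hypothetical, and the import assumes examples/team_recommender/src is on the import path (e.g. via the conftest above):

```python
import logging

from retry import retry

logging.basicConfig(level=logging.WARNING)


@retry(max_attempts=3, exceptions=(ConnectionError,), initial_delay=0.5)
def flaky_call() -> str:
    # Hypothetical transient failure; each raise triggers a retry with backoff.
    raise ConnectionError("temporary network error")


flaky_call()  # logs two warnings, then re-raises ConnectionError on the third attempt
```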
14 changes: 14 additions & 0 deletions examples/team_recommender/src/settings.py
@@ -0,0 +1,14 @@
from pathlib import Path


def root_path() -> Path:
"""Returns the absolute path to the root of the project."""
return Path(__file__).parent.parent.resolve()


def root_dir() -> str:
"""Returns the absolute path to the root directory of the project."""
return str(root_path())


ROOT_DIR = root_dir()
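For context, the updated test below uses these helpers to locate fixtures relative to the example root; a sketch of that pattern (the fixture file name here is hypothetical):

```python
from settings import root_path, root_dir

# Path objects compose cleanly and do not depend on the current working directory.
fixture_path = root_path() / "tests" / "fixtures" / "example_fixture.json"
output_dir = root_dir()  # plain string form, for APIs that expect str paths
```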
@@ -1,9 +1,13 @@
import json
import os
from typing import List

import openai
from jsonschema import FormatChecker, validate
from openai import OpenAI
from tests.settings import ROOT_DIR
from openai.types.chat.chat_completion import Choice

from settings import root_path, root_dir
from retry import retry

from cat_ai.reporter import Reporter
from cat_ai.runner import Runner
@@ -44,7 +48,7 @@ def load_json_fixture(file_name: str) -> dict:
:param file_name: Name of the JSON file to load.
:return: Parsed JSON data as a dictionary.
"""
json_path = os.path.join(ROOT_DIR, "fixtures", file_name)
json_path = root_path() / "tests" / "fixtures" / file_name
with open(json_path, "r") as file:
return json.load(file)

@@ -91,30 +95,18 @@ def test_response_has_valid_schema():
It will find exciting moments from sports highlights videos.
"""

client = OpenAI()
assert client is not None

completion = client.chat.completions.create(
model="gpt-4-1106-preview",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": project_description},
],
response_format={"type": "json_object"},
n=generations,
)
responses = completion.choices
responses = generate_choices(generations, project_description, system_prompt)

results = []
for run in range(0, generations):
response = responses[run].message.content
test_reporter = Reporter(
"test_fast_with_n_generations",
f"test_fast_with_{generations}_generation{'' if generations == 1 else 's'}",
metadata={
"system_prompt": system_prompt,
"user_prompt": project_description,
},
output_dir=ROOT_DIR,
output_dir=root_dir(),
)
test_runner = Runner(
lambda reporter: run_allocation_test(
@@ -128,7 +120,31 @@ def test_response_has_valid_schema():
assert has_expected_success_rate(results, failure_threshold)


def run_allocation_test(reporter, skills_data, response) -> bool:
@retry(
max_attempts=4,
exceptions=(openai.APIConnectionError,),
initial_delay=1.0,
backoff_factor=2.0,
logger_name="openai.api",
)
def generate_choices(generations, project_description, system_prompt) -> List[Choice]:
client = OpenAI()
assert client is not None

completion = client.chat.completions.create(
model="gpt-4-1106-preview",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": project_description},
],
response_format={"type": "json_object"},
n=generations,
)
responses = completion.choices
return responses


def run_allocation_test(reporter: Reporter, skills_data, response: str) -> bool:
acceptable_people = ["Sam Thomas", "Drew Anderson", "Alex Wilson", "Alex Johnson"]
all_developers = get_all_developer_names(skills_data)

@@ -138,6 +154,7 @@ def run_allocation_test(reporter, skills_data, response) -> bool:
not_empty_response = True
no_developer_name_is_hallucinated = True
developer_is_appropriate = True
json_object = {}
try:
json_object = json.loads(response)
has_valid_json_schema = response_matches_json_schema(json_object, schema)
4 changes: 0 additions & 4 deletions examples/team_recommender/tests/settings.py

This file was deleted.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -37,6 +37,7 @@ dev = [
"pydrive2>=1.21.3,<2",
"pydantic>=2.10.6,<3",
"ruff>=0.9.10",
"pytest-timeout>=2.3.1",
]

[tool.uv]
@@ -46,6 +47,7 @@ default-groups = ["test", "examples", "dev"]
asyncio_mode = "auto"
pythonpath = [".", "src"]
testpaths = ["tests"]
faulthandler_timeout="100"

[tool.mypy]
python_version = "3.13"
21 changes: 11 additions & 10 deletions src/cat_ai/reporter.py
@@ -61,24 +61,25 @@ def report(self, response: str, results: Dict[str, bool]) -> bool:
return final_result

@staticmethod
def format_summary(analysis: StatisticalAnalysis) -> str:
def format_summary(to_report: StatisticalAnalysis) -> str:
"""
Format the statistical analysis as a markdown string.

Args:
analysis: StatisticalAnalysis object containing analysis data
to_report: StatisticalAnalysis object containing analysis data

Returns:
str: Formatted string with the error margin calculations and confidence interval
"""
output = f"> [!NOTE]\n"
output += f"> ### There are {analysis.failure_count} failures out of {analysis.sample_size} generations.\n"
output += f"> Sample Proportion (p̂): {analysis.proportion:.4f}\n"
output += f"> Standard Error (SE): {analysis.standard_error:.6f}\n"
output += f"> Margin of Error (ME): {analysis.margin_of_error:.6f}\n"
output += f"> 90% Confidence Interval: [{analysis.confidence_interval_prop[0]:.6f}, {analysis.confidence_interval_prop[1]:.6f}]\n"
output += f"> 90% Confidence Interval (Count): [{analysis.confidence_interval_count[0]}, {analysis.confidence_interval_count[1]}]"

output = "> [!NOTE]\n"
output += f"> ## {to_report.failure_count} ± {to_report.margin_of_error_count} failures detected ({to_report.sample_size} samples)\n"
output += "> \n"
output += f"> **90% Confidence Range:** {to_report.confidence_interval_count[0]}-{to_report.confidence_interval_count[1]} failures\n"
output += "> \n"
output += "> **Details:**\n"
output += f"> - Proportion: {to_report.proportion:.4f} [{to_report.confidence_interval_prop[0]:.4f}, {to_report.confidence_interval_prop[1]:.4f}]\n"
output += f"> - Standard Error: {to_report.standard_error:.4f}\n"
output += f"> - Margin of Error: {to_report.margin_of_error:.4f}\n"
return output
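A hedged sketch of calling the reworked formatter: StatisticalAnalysis's constructor is not shown in this diff, so the stand-in object below only carries the attributes format_summary reads, filled with illustrative numbers:

```python
from types import SimpleNamespace

from cat_ai.reporter import Reporter

# Stand-in for a StatisticalAnalysis result (fields taken from format_summary above).
analysis = SimpleNamespace(
    failure_count=3,
    margin_of_error_count=2,
    sample_size=10,
    confidence_interval_count=(1, 5),
    proportion=0.3,
    confidence_interval_prop=(0.0616, 0.5384),
    standard_error=0.1449,
    margin_of_error=0.2384,
)

print(Reporter.format_summary(analysis))  # renders the GitHub "[!NOTE]" markdown callout
```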

