Merged — changes from all commits
All 27 commits were authored by paulz on Mar 11, 2025:

c749b1f  fix: https://github.com/thisisartium/continuous-alignment-testing/iss…
47bc9f8  fix: ModuleNotFoundError: No module named 'settings'
403cc20  add src folder to pytest path
2b44e70  fix: update path for test runs in cat-test-examples.yml
43917e6  fix: improve description and output message for rounds input in cat-t…
b294288  fix: AI tests appear hanging
ecb8490  fix: update folder references in cat-test-examples.yml for AI tests
47c4fa5  fix: rename folder variable for readability
82343f4  fix: Input required and not supplied: path
958ed92  refactor: generate_choices function to remove client parameter and im…
5f5cea8  refactor: report margin of error for readability
7eb3f63  fix: type warning
5ed982b  fix: mypy
e1e3bd9  fix: input rounds should override defaults
03e2362  fix: simplify run count logic in CI configuration
7fbe679  fix: add debug output for GitHub ref name in CI configuration
151288b  fix: bash
65f81d2  fix: bash with Claude
84f139b  fix: bash with Claude
f3fe388  fix: bash with Claude
4e29c3e  fix: CI svg snapshot
d4c7b32  fix: CI generates different SVG
7113e81  fix: CI generates different PNG
194f3c8  debug: unit tests
65813a8  fix: CI hangs on unit tests
29f7569  fix: CI hangs on unit tests
dde706e  skip creation date metadata
48 changes: 29 additions & 19 deletions .github/workflows/cat-test-examples.yml
@@ -6,7 +6,7 @@ on:
workflow_dispatch:
inputs:
rounds:
description: "Number of Rounds"
description: "Number of Rounds 1 - 128"
type: number
required: true
default: 10
@@ -15,6 +15,8 @@ jobs:
ai_tests:
name: AI Tests
runs-on: ubuntu-latest
env:
TEST_RESULTS_FOLDER: examples/team_recommender/test_runs

steps:
- uses: actions/checkout@v4
@@ -41,21 +43,30 @@ jobs:
- name: Set number of runs
id: set-number-of-runs
run: |
ROUNDS=${{ inputs.rounds || 10 }}
[[ $GITHUB_REF_NAME == ci-experiment* ]] && ROUNDS=1
[[ "${GITHUB_REF_NAME}" =~ ^ci-experiment/ ]] && ROUNDS=1 || ROUNDS=10
ROUNDS=${INPUT_ROUNDS:-$ROUNDS}

if [ "$ROUNDS" -gt 128 ] || [ "$ROUNDS" -le 0 ]
then
echo "Invalid number of rounds: $ROUNDS"
exit 1
fi

echo "::notice::Starting $ROUNDS runs"
echo "::notice::Starting ${ROUNDS} run$([ "$ROUNDS" -eq 1 ] || echo "s")"
echo "number_of_runs=$ROUNDS" >> "$GITHUB_OUTPUT"
echo "CAT_AI_SAMPLE_SIZE=$ROUNDS" >> $GITHUB_ENV

- name: Run Example tests
run: uv run pytest examples/team_recommender/tests/example_7_*
run: >
uv run pytest
--verbose --verbosity=10 --capture=no --tb=native --color=yes --showlocals
examples/team_recommender/tests/example_7_*
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

# - name: Upload artifacts to MinIO
# run: |
# zip -r test-output-${{ github.run_number }}.zip examples/team_recommender/tests/test_runs
# zip -r test-output-${{ github.run_number }}.zip "examples/team_recommender/test_runs"
# curl -X PUT -T "/path/to/yourfile.zip" \
# -H "Host: localhost:9000" \
# -H "Date: $(date -R)" \
@@ -65,32 +76,31 @@ jobs:

- name: Show CAT AI Statistical Report
if: always()
run: |
FOLDER=examples/team_recommender/tests/test_runs
FAILURE_COUNT=$(find "$FOLDER" -type f -name "fail-*" | wc -l)
run: |
FAILURE_COUNT=$(find "$TEST_RESULTS_FOLDER" -type f -name "fail-*.json" | wc -l)
PYTHONPATH=src uv run python -m cat_ai.reporter \
"$FAILURE_COUNT" \
"$CAT_AI_SAMPLE_SIZE" \
>> "$GITHUB_STEP_SUMMARY"

- name: Upload main artifacts to Google Drive
if: always() && github.ref == 'refs/heads/main'
if: always() && github.ref_name == 'main'
run: |
zip -r "$FILENAME" examples/team_recommender/tests/test_runs
uv run python src/cat_ai/publish_to_gdrive.py "$FILENAME"
zip -r "$ZIP_WITH_RUN" "$TEST_RESULTS_FOLDER"
uv run python src/cat_ai/publish_to_gdrive.py "$ZIP_WITH_RUN"
env:
PARENT_FOLDER_IDS: ${{ vars.GOOGLE_DRIVE_TEST_OUTPUT_FOLDER_ID }}
FILENAME: test-output-${{ github.run_number }}.zip
ZIP_WITH_RUN: test-output-${{ github.run_number }}.zip

- name: Upload artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: test-output-${{ github.run_number }}
path: examples/team_recommender/tests/test_runs
path: ${{ env.TEST_RESULTS_FOLDER }}

# - name: Debugging with tmate
# if: failure()
# uses: lhotari/action-upterm@v1
# with:
# wait-timeout-minutes: 5
- name: Debugging with tmate
if: failure()
uses: lhotari/action-upterm@v1
with:
wait-timeout-minutes: 5
8 changes: 6 additions & 2 deletions .github/workflows/python-tests.yml
@@ -30,7 +30,11 @@ jobs:
run: uv sync --all-extras --dev

- name: Run unit tests
run: uv run pytest

run: >
uv run pytest
--timeout=10
--color=yes
--verbose --verbosity=10 --capture=no --tb=native --showlocals

- name: Type check Python code
run: uv run mypy src
6 changes: 6 additions & 0 deletions examples/team_recommender/conftest.py
@@ -1,4 +1,10 @@
from dotenv import load_dotenv
import sys
from pathlib import Path

# Load environment variables from .env file
load_dotenv()

source_folder = str((Path(__file__).parent / "src").resolve())
print("source_folder", source_folder)
sys.path.append(source_folder)
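With the example's src directory appended to sys.path by this conftest, the example tests can import the new helper modules directly. A minimal sketch of what that enables, assuming it runs under this conftest:

```python
# These imports only resolve because conftest.py appended
# examples/team_recommender/src to sys.path.
from settings import ROOT_DIR, root_path
from retry import retry
```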
60 changes: 60 additions & 0 deletions examples/team_recommender/src/retry.py
@@ -0,0 +1,60 @@
import time
import logging
from functools import wraps
from typing import Any, Callable, TypeVar, Optional, Tuple, Type, Union, Dict, List

T = TypeVar('T')
logger = logging.getLogger(__name__)

def retry(
max_attempts: int = 3,
exceptions: Tuple[Type[Exception], ...] = (Exception,),
initial_delay: float = 1.0,
backoff_factor: float = 2.0,
logger_name: Optional[str] = None
) -> Callable:
"""
Retry decorator with exponential backoff for handling transient errors.

Args:
max_attempts: Maximum number of attempts (including first try)
exceptions: Tuple of exception types to catch and retry
initial_delay: Initial delay between retries in seconds
backoff_factor: Multiplier for delay after each retry
logger_name: Optional logger name for custom logging

Returns:
Decorated function with retry logic
"""
local_logger = logger
if logger_name:
local_logger = logging.getLogger(logger_name)

def decorator(func: Callable[..., T]) -> Callable[..., T]:
@wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> T:
attempt = 1
current_delay = initial_delay

while True:
try:
return func(*args, **kwargs)
except exceptions as e:
if attempt >= max_attempts:
local_logger.error(
f"Failed after {max_attempts} attempts: {e.__class__.__name__}: {str(e)}"
)
raise

local_logger.warning(
f"Attempt {attempt}/{max_attempts} failed with {e.__class__.__name__}: {str(e)}. "
f"Retrying in {current_delay:.2f}s..."
)

time.sleep(current_delay)
current_delay *= backoff_factor
attempt += 1

return wrapper

return decorator
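A minimal usage sketch for this decorator; the flaky function below is hypothetical, and the import assumes examples/team_recommender/src is on the import path (e.g. via the conftest above):

```python
import logging

from retry import retry

logging.basicConfig(level=logging.WARNING)


@retry(max_attempts=3, exceptions=(ConnectionError,), initial_delay=0.5)
def flaky_call() -> str:
    # Hypothetical transient failure; each raise triggers a retry with backoff.
    raise ConnectionError("temporary network error")


flaky_call()  # logs two warnings, then re-raises ConnectionError on the third attempt
```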
14 changes: 14 additions & 0 deletions examples/team_recommender/src/settings.py
@@ -0,0 +1,14 @@
from pathlib import Path


def root_path() -> Path:
"""Returns the absolute path to the root of the project."""
return Path(__file__).parent.parent.resolve()


def root_dir() -> str:
"""Returns the absolute path to the root directory of the project."""
return str(root_path())


ROOT_DIR = root_dir()
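For context, the updated test below uses these helpers to locate fixtures relative to the example root; a sketch of that pattern (the fixture file name here is hypothetical):

```python
from settings import root_path, root_dir

# Path objects compose cleanly and do not depend on the current working directory.
fixture_path = root_path() / "tests" / "fixtures" / "example_fixture.json"
output_dir = root_dir()  # plain string form, for APIs that expect str paths
```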
@@ -1,9 +1,13 @@
import json
import os
from typing import List

import openai
from jsonschema import FormatChecker, validate
from openai import OpenAI
from tests.settings import ROOT_DIR
from openai.types.chat.chat_completion import Choice

from settings import root_path, root_dir
from retry import retry

from cat_ai.reporter import Reporter
from cat_ai.runner import Runner
@@ -44,7 +48,7 @@ def load_json_fixture(file_name: str) -> dict:
:param file_name: Name of the JSON file to load.
:return: Parsed JSON data as a dictionary.
"""
json_path = os.path.join(ROOT_DIR, "fixtures", file_name)
json_path = root_path() / "tests" / "fixtures" / file_name
with open(json_path, "r") as file:
return json.load(file)

@@ -91,30 +95,18 @@ def test_response_has_valid_schema():
It will find exciting moments from sports highlights videos.
"""

client = OpenAI()
assert client is not None

completion = client.chat.completions.create(
model="gpt-4-1106-preview",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": project_description},
],
response_format={"type": "json_object"},
n=generations,
)
responses = completion.choices
responses = generate_choices(generations, project_description, system_prompt)

results = []
for run in range(0, generations):
response = responses[run].message.content
test_reporter = Reporter(
"test_fast_with_n_generations",
f"test_fast_with_{generations}_generation{'' if generations == 1 else 's'}",
metadata={
"system_prompt": system_prompt,
"user_prompt": project_description,
},
output_dir=ROOT_DIR,
output_dir=root_dir(),
)
test_runner = Runner(
lambda reporter: run_allocation_test(
@@ -128,7 +120,31 @@ def test_response_has_valid_schema():
assert has_expected_success_rate(results, failure_threshold)


def run_allocation_test(reporter, skills_data, response) -> bool:
@retry(
max_attempts=4,
exceptions=(openai.APIConnectionError,),
initial_delay=1.0,
backoff_factor=2.0,
logger_name="openai.api",
)
def generate_choices(generations, project_description, system_prompt) -> List[Choice]:
client = OpenAI()
assert client is not None

completion = client.chat.completions.create(
model="gpt-4-1106-preview",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": project_description},
],
response_format={"type": "json_object"},
n=generations,
)
responses = completion.choices
return responses


def run_allocation_test(reporter: Reporter, skills_data, response: str) -> bool:
acceptable_people = ["Sam Thomas", "Drew Anderson", "Alex Wilson", "Alex Johnson"]
all_developers = get_all_developer_names(skills_data)

@@ -138,6 +154,7 @@ def run_allocation_test(reporter, skills_data, response) -> bool:
not_empty_response = True
no_developer_name_is_hallucinated = True
developer_is_appropriate = True
json_object = {}
try:
json_object = json.loads(response)
has_valid_json_schema = response_matches_json_schema(json_object, schema)
4 changes: 0 additions & 4 deletions examples/team_recommender/tests/settings.py

This file was deleted.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -37,6 +37,7 @@ dev = [
"pydrive2>=1.21.3,<2",
"pydantic>=2.10.6,<3",
"ruff>=0.9.10",
"pytest-timeout>=2.3.1",
]

[tool.uv]
@@ -46,6 +47,7 @@ default-groups = ["test", "examples", "dev"]
asyncio_mode = "auto"
pythonpath = [".", "src"]
testpaths = ["tests"]
faulthandler_timeout="100"

[tool.mypy]
python_version = "3.13"
21 changes: 11 additions & 10 deletions src/cat_ai/reporter.py
@@ -61,24 +61,25 @@ def report(self, response: str, results: Dict[str, bool]) -> bool:
return final_result

@staticmethod
def format_summary(analysis: StatisticalAnalysis) -> str:
def format_summary(to_report: StatisticalAnalysis) -> str:
"""
Format the statistical analysis as a markdown string.

Args:
analysis: StatisticalAnalysis object containing analysis data
to_report: StatisticalAnalysis object containing analysis data

Returns:
str: Formatted string with the error margin calculations and confidence interval
"""
output = f"> [!NOTE]\n"
output += f"> ### There are {analysis.failure_count} failures out of {analysis.sample_size} generations.\n"
output += f"> Sample Proportion (p̂): {analysis.proportion:.4f}\n"
output += f"> Standard Error (SE): {analysis.standard_error:.6f}\n"
output += f"> Margin of Error (ME): {analysis.margin_of_error:.6f}\n"
output += f"> 90% Confidence Interval: [{analysis.confidence_interval_prop[0]:.6f}, {analysis.confidence_interval_prop[1]:.6f}]\n"
output += f"> 90% Confidence Interval (Count): [{analysis.confidence_interval_count[0]}, {analysis.confidence_interval_count[1]}]"

output = "> [!NOTE]\n"
output += f"> ## {to_report.failure_count} ± {to_report.margin_of_error_count} failures detected ({to_report.sample_size} samples)\n"
output += "> \n"
output += f"> **90% Confidence Range:** {to_report.confidence_interval_count[0]}-{to_report.confidence_interval_count[1]} failures\n"
output += "> \n"
output += "> **Details:**\n"
output += f"> - Proportion: {to_report.proportion:.4f} [{to_report.confidence_interval_prop[0]:.4f}, {to_report.confidence_interval_prop[1]:.4f}]\n"
output += f"> - Standard Error: {to_report.standard_error:.4f}\n"
output += f"> - Margin of Error: {to_report.margin_of_error:.4f}\n"
return output
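A hedged sketch of calling the reworked formatter: StatisticalAnalysis's constructor is not shown in this diff, so the stand-in object below only carries the attributes format_summary reads, filled with illustrative numbers:

```python
from types import SimpleNamespace

from cat_ai.reporter import Reporter

# Stand-in for a StatisticalAnalysis result (fields taken from format_summary above).
analysis = SimpleNamespace(
    failure_count=3,
    margin_of_error_count=2,
    sample_size=10,
    confidence_interval_count=(1, 5),
    proportion=0.3,
    confidence_interval_prop=(0.0616, 0.5384),
    standard_error=0.1449,
    margin_of_error=0.2384,
)

print(Reporter.format_summary(analysis))  # renders the GitHub "[!NOTE]" markdown callout
```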

