7 changes: 4 additions & 3 deletions docs/local-development.md
@@ -7,6 +7,7 @@ The first step will be just to be able to run the first version of your prompt a

Imagine we have a Python project called `team_recommender`, where we recommend teams of developers for a given project. The basic structure looks like this:

```
team_recommender/
├── README.md
├── requirements.txt
@@ -21,7 +22,7 @@ team_recommender/
├── __init__.py
├── test_allocations.py
└── settings.py

```

## Single Test
We start by running a test prompt inside a pytest test to check that the LLM recommends developers we think have relevant skills, based on our fixture data:
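The full test body is collapsed in this diff, but the fragments shown further down suggest its shape. A minimal sketch, assuming an OpenAI-style chat client and placeholder prompt/fixture values (the model name and the `acceptable_people` entries below are illustrative, not taken from the repo):

```python
from openai import OpenAI


def test_allocations():
    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
    acceptable_people = ["Alice", "Bob"]  # placeholder names we expect from fixture data
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # model choice is an assumption
        messages=[
            {
                "role": "user",
                "content": "Recommend a team of developers for a Python project. Respond in JSON.",
            }
        ],
        response_format={"type": "json_object"},
    )
    response = completion.choices[0].message.content
    # The test passes if any developer we consider relevant appears in the answer.
    person_with_relevant_skill_was_selected = any(
        name in response for name in acceptable_people
    )
    assert person_with_relevant_skill_was_selected
```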
@@ -183,7 +184,7 @@ def test_allocations():
run_allocation_test,
reporter=test_reporter,
)
results = test_runner.run_loop(tries)
results = test_runner.run_multiple(tries)
assert False not in results


@@ -336,7 +337,7 @@ def test_allocations():
),
reporter=test_reporter,
)
results = test_runner.run_loop(tries)
results = test_runner.run_multiple(tries)
assert False not in results


@@ -42,8 +42,5 @@ def test_allocations():
response_format={"type": "json_object"},
)
response = completion.choices[0].message.content
person_with_relevant_skill_was_selected = any(
name in response for name in acceptable_people
)
person_with_relevant_skill_was_selected = any(name in response for name in acceptable_people)
assert person_with_relevant_skill_was_selected

@@ -43,7 +43,7 @@ def test_allocations():
run_allocation_test,
reporter=test_reporter,
)
results = test_runner.run_loop(tries)
results = test_runner.run_multiple(tries)
assert False not in results


@@ -69,4 +69,3 @@ def run_allocation_test(reporter) -> bool:
except json.JSONDecodeError as e:
print(f"JSON Exception: {e}")
return result

@@ -7,11 +7,7 @@


def get_all_developer_names(skills_data) -> set[str]:
return {
developer["developer"]["name"]
for skill in skills_data["skills"]
for developer in skill["developerSkills"]
}
return {developer["developer"]["name"] for skill in skills_data["skills"] for developer in skill["developerSkills"]}


def get_developer_names_from_response(response) -> set[str]:
@@ -52,12 +48,10 @@ def test_allocations():
output_dir=ROOT_DIR,
)
test_runner = Runner(
lambda reporter: run_allocation_test(
reporter=reporter, skills_data=skills_data
),
lambda reporter: run_allocation_test(reporter=reporter, skills_data=skills_data),
reporter=test_reporter,
)
results = test_runner.run_loop(tries)
results = test_runner.run_multiple(tries)
assert False not in results


@@ -82,9 +76,7 @@ def run_allocation_test(reporter, skills_data) -> bool:
try:
json_object = json.loads(response)
developer_names = get_developer_names_from_response(json_object)
no_developer_name_is_hallucinated = False not in [
name in all_developers for name in developer_names
]
no_developer_name_is_hallucinated = False not in [name in all_developers for name in developer_names]

reporter.report(
json_object,
@@ -1,22 +1,21 @@
import json
import os

from openai import OpenAI
from tests.settings import ROOT_DIR

from cat_ai.reporter import Reporter
from cat_ai.runner import Runner
from tests.settings import ROOT_DIR
from openai import OpenAI


def get_all_developer_names(skills_data) -> set[str]:
return {
developer["developer"]["name"]
for skill in skills_data["skills"]
for developer in skill["developerSkills"]
}
return {developer["developer"]["name"] for skill in skills_data["skills"] for developer in skill["developerSkills"]}
Contributor: I know this is just what your formatter does, but which one do you actually think is easier to read?

Contributor (Author): Maybe a 120-character line length is too big? At 100 the formatter would go multiline. What do you think?

Contributor: Does that mean you prefer the multi-line version? I'm open to either.



def get_developer_names_from_response(response) -> set[str]:
return {developer["name"] for developer in response["developers"]}


def has_expected_success_rate(results: list[bool], expected_success_rate: float) -> bool:
if not results:
return True
@@ -27,8 +26,9 @@ def has_expected_success_rate(results: list[bool], expected_success_rate: float)
print(1.0 - failure_rate)
return expected_success_rate <= (1.0 - failure_rate)


def test_allocations():
tries = Runner.sample_size(3)
tries = Runner.get_sample_size(3)
skills_json_path = os.path.join(ROOT_DIR, "fixtures", "skills.json")
with open(skills_json_path, "r") as file:
skills_data = json.load(file)
@@ -61,12 +61,10 @@ def test_allocations():
output_dir=ROOT_DIR,
)
test_runner = Runner(
lambda reporter: run_allocation_test(
reporter=reporter, skills_data=skills_data
),
lambda reporter: run_allocation_test(reporter=reporter, skills_data=skills_data),
reporter=test_reporter,
)
results = test_runner.run_loop(tries)
results = test_runner.run_multiple(tries)
failure_threshold = 0.8
assert has_expected_success_rate(results, failure_threshold)

@@ -92,9 +90,7 @@ def run_allocation_test(reporter, skills_data) -> bool:
try:
json_object = json.loads(response)
developer_names = get_developer_names_from_response(json_object)
no_developer_name_is_hallucinated = False not in [
name in all_developers for name in developer_names
]
no_developer_name_is_hallucinated = False not in [name in all_developers for name in developer_names]

reporter.report(
json_object,
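One note on the `has_expected_success_rate` check earlier in this file: with the default sample size of 3 and a threshold of 0.8, a single failed run already sinks the test. A self-contained sketch of the arithmetic (assuming `failure_rate` is simply the fraction of `False` results, which the collapsed lines appear to compute):

```python
def has_expected_success_rate(results: list[bool], expected_success_rate: float) -> bool:
    # Same shape as the helper above; the failure-rate line is an assumption.
    if not results:
        return True
    failure_rate = results.count(False) / len(results)
    return expected_success_rate <= (1.0 - failure_rate)


# 3 runs with one failure: success rate is 2/3 ≈ 0.67, below the 0.8 threshold.
assert has_expected_success_rate([True, True, False], 0.8) is False
# 10 runs with one failure: success rate 0.9 clears the threshold.
assert has_expected_success_rate([True] * 9 + [False], 0.8) is True
```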
2 changes: 1 addition & 1 deletion examples/team_recommender/tests/settings.py
@@ -1,4 +1,4 @@
import os


ROOT_DIR = os.path.dirname(os.path.abspath(os.path.join("..", __file__)))
ROOT_DIR = os.path.dirname(os.path.abspath(os.path.join("..", __file__)))
27 changes: 27 additions & 0 deletions qodana.yaml
@@ -0,0 +1,27 @@
#-------------------------------------------------------------------------------#
Contributor: Tell me more about this tool.

Contributor (Author): Built in for PyCharm: https://www.jetbrains.com/qodana/

# Qodana analysis is configured by qodana.yaml file #
# https://www.jetbrains.com/help/qodana/qodana-yaml.html #
#-------------------------------------------------------------------------------#
version: "1.0"
#Specify inspection profile for code analysis
profile:
name: qodana.starter
#Enable inspections
#include:
# - name: <SomeEnabledInspectionId>
#Disable inspections
#exclude:
# - name: <SomeDisabledInspectionId>
# paths:
# - <path/where/not/run/inspection>
#Execute shell command before Qodana execution (Applied in CI/CD pipeline)
#bootstrap: sh ./prepare-qodana.sh
#Install IDE plugins before Qodana execution (Applied in CI/CD pipeline)
#plugins:
# - id: <plugin.id> #(plugin id can be found at https://plugins.jetbrains.com)
#Specify Qodana linter for analysis (Applied in CI/CD pipeline)
linter: jetbrains/qodana-python:2024.3
exclude:
- name: All
paths:
- docs
1 change: 1 addition & 0 deletions src/cat_ai/publish_to_gdrive.py
@@ -4,6 +4,7 @@
from pydrive2.auth import GoogleAuth # type: ignore
from pydrive2.drive import GoogleDrive # type: ignore


def login_with_service_account(credentials_path: str) -> GoogleAuth:
"""
Google Drive service with a service account.
6 changes: 3 additions & 3 deletions src/cat_ai/reporter.py
@@ -1,7 +1,7 @@
import json
import os
from datetime import datetime
from typing import Any, Dict
from typing import Optional, Any, Dict


class Reporter:
@@ -14,10 +14,10 @@ def _create_unique_id_from_time() -> str:
return datetime.now().strftime("%Y%m%d_%H%M%S")

def __init__(
self, test_name: str, output_dir: str, unique_id: str | None = None, metadata: Dict[str, Any] = {}
self, test_name: str, output_dir: str, unique_id: str | None = None, metadata: Optional[Dict[str, Any]] = None
) -> None:
self.test_name = test_name
self.metadata = metadata
self.metadata = metadata or {}
if not unique_id:
unique_id = self._create_unique_id_from_time()
unique_dir_name = f"{test_name}-{unique_id}"
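The `metadata` change in this file is the standard fix for Python's mutable-default-argument pitfall: a `{}` default is created once, when the function is defined, and then shared by every call that relies on it. A toy illustration of the before/after behaviour (stand-in classes, not the real `Reporter`):

```python
from typing import Any, Dict, Optional


class Shared:
    def __init__(self, metadata: Dict[str, Any] = {}) -> None:  # pitfall: one shared dict
        self.metadata = metadata


class Fixed:
    def __init__(self, metadata: Optional[Dict[str, Any]] = None) -> None:
        self.metadata = metadata or {}  # fresh dict per instance


a, b = Shared(), Shared()
a.metadata["run"] = 1
assert b.metadata == {"run": 1}  # surprise: b sees a's mutation

c, d = Fixed(), Fixed()
c.metadata["run"] = 1
assert d.metadata == {}  # each instance gets its own dict
```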
58 changes: 46 additions & 12 deletions src/cat_ai/runner.py
@@ -1,25 +1,59 @@
import os
from typing import Callable, List, Optional

from .reporter import Reporter
from typing import Callable, Any


class Runner:
def __init__(self, test_function: Callable[..., Any], reporter: Reporter) -> None:
"""Executes test functions and collects results using a reporter."""

def __init__(self, test_function: Callable[[Reporter], bool], reporter: Reporter) -> None:
"""
Initialize the Runner with a test function and reporter.

Args:
test_function: Function to execute during test runs
reporter: Reporter instance to track and report test results
"""
self.reporter = reporter
self.test_function = test_function

@staticmethod
def sample_size(default_size: int = 1) -> int:
def get_sample_size(default_size: int = 1) -> int:
"""
Get sample size from environment variable or use default.

Args:
default_size: Default sample size if not specified in environment

Returns:
Number of test runs to perform
"""
return int(os.getenv("CAT_AI_SAMPLE_SIZE", str(default_size)))

def run_once(self, run_number: int = 0) -> Any:
def run_once(self, run_number: int = 0) -> bool:
"""
Execute the test function once.

Args:
run_number: Current run index for reporting

Returns:
Result from the test function
"""
self.reporter.run_number = run_number
result = self.test_function(reporter=self.reporter)
return result

def run_loop(self, tries: int = sample_size()) -> list[Any]:
results = []
for x in range(0, tries):
results.append(self.run_once(x))
return results
return self.test_function(self.reporter)

def run_multiple(self, sample_size: Optional[int] = None) -> List[bool]:
"""
Execute the test function multiple times based on sample size.

Args:
sample_size: Number of times to run the test, defaults to
value from get_sample_size() if None

Returns:
List of results from all test runs
"""
runs = sample_size if sample_size is not None else self.get_sample_size()
return [self.run_once(i) for i in range(runs)]
12 changes: 4 additions & 8 deletions tests/cat_ai/test_reporter.py
@@ -1,8 +1,8 @@
import json
import time
from unittest.mock import mock_open, patch, MagicMock
from cat_ai.reporter import Reporter
from cat_ai.helpers.helpers import root_dir
from src.cat_ai.reporter import Reporter
from src.cat_ai.helpers.helpers import root_dir


def test_reporter_create_a_unique_folder_path() -> None:
@@ -18,18 +18,14 @@ def test_reporter_create_a_unique_folder_path() -> None:
def test_reporter_can_accept_unique_id_override() -> None:
test_name = "id_override"
unique_id = "some_string"
reporter1 = Reporter(
test_name=test_name, output_dir=root_dir(), unique_id=unique_id
)
reporter1 = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
expected_dir_path = f"{root_dir()}/test_runs/{test_name}-{unique_id}"
assert str(expected_dir_path) == str(reporter1.folder_path)


@patch("os.makedirs")
@patch("builtins.open", new_callable=mock_open)
def test_report_creates_correct_json(
mock_open: MagicMock, mock_makedirs: MagicMock
) -> None:
def test_report_creates_correct_json(mock_open: MagicMock, mock_makedirs: MagicMock) -> None:
test_name = "report_creates_correct_json"
unique_id = "20231001_120000"
reporter = Reporter(test_name=test_name, output_dir=root_dir(), unique_id=unique_id)
64 changes: 64 additions & 0 deletions tests/cat_ai/test_runner.py
@@ -0,0 +1,64 @@
from src.cat_ai.reporter import Reporter
from src.cat_ai.runner import Runner


# Dummy test function that will be passed to Runner
def dummy_test_function(reporter: Reporter) -> bool:
# Imagine that this function does something meaningful
# Simply returning True instead of trying to log
return True


def test_runner_sample_size(monkeypatch):
# Set an environment variable to test
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "5")
assert Runner.get_sample_size() == 5

# Test default size
monkeypatch.delenv("CAT_AI_SAMPLE_SIZE", raising=False)
assert Runner.get_sample_size(default_size=3) == 3


def test_run_once():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_once", output_dir="/tmp")

# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)

# Test run_once
result = runner.run_once()
assert result is True
assert reporter.run_number == 0


def test_run_multiple():
# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run", output_dir="/tmp")

# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)

# Test with explicit sample size parameter
results = runner.run_multiple(sample_size=2)
assert len(results) == 2
assert all(results)
expected_results = [True, True]
assert results == expected_results


def test_run_with_env_variable(monkeypatch):
# Set the environment variable for a controlled test
monkeypatch.setenv("CAT_AI_SAMPLE_SIZE", "3")

# Create a Reporter with necessary arguments
reporter = Reporter(test_name="test_run_with_env", output_dir="/tmp")

# Initialize Runner with dummy test function and Reporter
runner = Runner(test_function=dummy_test_function, reporter=reporter)

# Test without explicit sample size (should use environment variable)
results = runner.run_multiple()
assert len(results) == 3
expected_results = [True, True, True]
assert results == expected_results