diff --git a/.gitignore b/.gitignore index 0c4b3800f5..d70de45982 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,5 @@ sdk/agenta/templates/agenta.py web/ee/public/__env.js web/oss/public/__env.js -web/oss/tests/datalayer/results \ No newline at end of file +web/oss/tests/datalayer/results +.* \ No newline at end of file diff --git a/api/ee/tests/manual/evaluations/sdk/quick_start.py b/api/ee/tests/manual/evaluations/sdk/quick_start.py index a60fdebfc9..1d43927fed 100644 --- a/api/ee/tests/manual/evaluations/sdk/quick_start.py +++ b/api/ee/tests/manual/evaluations/sdk/quick_start.py @@ -169,6 +169,7 @@ async def run_evaluation(): # Run evaluation print("Running evaluation...") eval_result = await aevaluate( + name="My First Eval", testsets=[my_testset.id], applications=[capital_quiz_app], evaluators=[ diff --git a/docs/blog/entries/annotate-your-llm-response-preview.mdx b/docs/blog/entries/annotate-your-llm-response-preview.mdx index ac2443ce74..0af9760569 100644 --- a/docs/blog/entries/annotate-your-llm-response-preview.mdx +++ b/docs/blog/entries/annotate-your-llm-response-preview.mdx @@ -20,7 +20,7 @@ This is useful to: - Run custom evaluation workflows - Measure application performance in real-time -Check out the how to [annotate traces from API](/observability/trace-with-python-sdk/annotate-traces) for more details. Or try our new tutorial (available as [jupyter notebook](https://github.com/Agenta-AI/agenta/blob/main/examples/jupyter/capture_user_feedback.ipynb)) [here](/tutorials/cookbooks/capture-user-feedback). +Check out the how to [annotate traces from API](/observability/trace-with-python-sdk/annotate-traces) for more details. Or try our new tutorial (available as [jupyter notebook](https://github.com/Agenta-AI/agenta/blob/main/examples/jupyter/observability/capture_user_feedback.ipynb)) [here](/tutorials/cookbooks/capture-user-feedback). + Open in Google Colaboratory + + +## What You'll Build + +By the end of this guide, you'll have: +- An application that returns country capitals +- Two evaluators that check if answers are correct +- A complete evaluation run with results + +The entire example takes less than 100 lines of code. + +## Prerequisites + +Install the Agenta SDK: + +```bash +pip install agenta +``` + +Set your environment variables: + +```bash +export AGENTA_API_KEY="your-api-key" +export AGENTA_HOST="https://cloud.agenta.ai" +export OPENAI_API_KEY="your-openai-api-key" # Required for LLM-as-a-judge evaluator +``` + +## Step 1: Initialize Agenta + +Create a new Python file and initialize the SDK: + +```python +import agenta as ag + +ag.init() +``` + +## Step 2: Create Your Application + +An application is any function that processes inputs and returns outputs. Use the `@ag.application` decorator to mark your function: + +```python +@ag.application( + slug="capital_finder", + name="Capital Finder", + description="Returns the capital of a given country" +) +async def capital_finder(country: str): + """ + Your application logic goes here. + For this example, we'll use a simple dictionary lookup. + """ + capitals = { + "Germany": "Berlin", + "France": "Paris", + "Spain": "Madrid", + "Italy": "Rome", + } + return capitals.get(country, "Unknown") +``` + +The function receives parameters from your test data. In this case, it gets `country` from the testcase and returns the capital city. + +## Step 3: Create an Evaluator + +An evaluator checks if your application's output is correct. 
Use the `@ag.evaluator` decorator: + +```python +@ag.evaluator( + slug="exact_match", + name="Exact Match Evaluator", + description="Checks if the output exactly matches the expected answer" +) +async def exact_match(capital: str, outputs: str): + """ + Compare the application's output to the expected answer. + + Args: + capital: The expected answer from the testcase + outputs: What your application returned + + Returns: + A dictionary with score and success flag + """ + is_correct = outputs == capital + return { + "score": 1.0 if is_correct else 0.0, + "success": is_correct, + } +``` + +The evaluator receives two types of inputs: +- Fields from your testcase (like `capital`) +- The application's output (always called `outputs`) + +## Step 4: Create Test Data + +Define your test cases as a list of dictionaries: + +```python +test_data = [ + {"country": "Germany", "capital": "Berlin"}, + {"country": "France", "capital": "Paris"}, + {"country": "Spain", "capital": "Madrid"}, + {"country": "Italy", "capital": "Rome"}, +] +``` + +Each dictionary represents one test case. The keys become parameters that your application and evaluators can access. + +## Step 5: Run the Evaluation + +Import the evaluation functions and run your test: + +```python +import asyncio +from agenta.sdk.evaluations import aevaluate + +async def run_evaluation(): + # Create a testset from your data + testset = await ag.testsets.acreate( + name="Country Capitals", + data=test_data, + ) + + # Run evaluation + result = await aevaluate( + testsets=[testset.id], + applications=[capital_finder], + evaluators=[exact_match], + ) + + return result + +# Run the evaluation +if __name__ == "__main__": + eval_result = asyncio.run(run_evaluation()) + print(f"Evaluation complete!") +``` + +## Complete Example + +Here's the full code in one place: + +```python +import asyncio +import agenta as ag +from agenta.sdk.evaluations import aevaluate + +# Initialize SDK +ag.init() + +# Define test data +test_data = [ + {"country": "Germany", "capital": "Berlin"}, + {"country": "France", "capital": "Paris"}, + {"country": "Spain", "capital": "Madrid"}, + {"country": "Italy", "capital": "Rome"}, +] + +# Create application +@ag.application( + slug="capital_finder", + name="Capital Finder", +) +async def capital_finder(country: str): + capitals = { + "Germany": "Berlin", + "France": "Paris", + "Spain": "Madrid", + "Italy": "Rome", + } + return capitals.get(country, "Unknown") + +# Create evaluator +@ag.evaluator( + slug="exact_match", + name="Exact Match", +) +async def exact_match(capital: str, outputs: str): + is_correct = outputs == capital + return { + "score": 1.0 if is_correct else 0.0, + "success": is_correct, + } + +# Run evaluation +async def main(): + testset = await ag.testsets.acreate( + name="Country Capitals", + data=test_data, + ) + + result = await aevaluate( + testsets=[testset.id], + applications=[capital_finder], + evaluators=[exact_match], + ) + + print(f"Evaluation complete!") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Understanding the Data Flow + +When you run an evaluation, here's what happens: + +1. **Testcase data** flows to the application + - Input: `{"country": "Germany", "capital": "Berlin"}` + - Application receives: `country="Germany"` + - Application returns: `"Berlin"` + +2. 
**Both testcase data and application output** flow to the evaluator + - Evaluator receives: `capital="Berlin"` (expected answer from testcase) + - Evaluator receives: `outputs="Berlin"` (what the application returned) + - Evaluator compares them and returns: `{"score": 1.0, "success": True}` + +3. **Results are collected** and stored in Agenta + - You can view them in the web interface + - Or access them programmatically from the result object + +## Next Steps + +Now that you've created your first evaluation, you can: + +- Learn how to [configure custom evaluators](/evaluation/evaluation-from-sdk/configuring-evaluators) with different scoring logic +- Explore [built-in evaluators](/evaluation/evaluation-from-sdk/configuring-evaluators#built-in-evaluators) like LLM-as-a-judge +- Understand how to [configure your application](/evaluation/evaluation-from-sdk/configuring-applications) for different use cases +- Run [multiple evaluators](/evaluation/evaluation-from-sdk/running-evaluations) in a single evaluation + +## Common Patterns + +### Using Multiple Evaluators + +You can run several evaluators on the same application: + +```python +result = await aevaluate( + testsets=[testset.id], + applications=[capital_finder], + evaluators=[ + exact_match, + case_insensitive_match, + similarity_check, + ], +) +``` + +Each evaluator runs independently and produces its own scores. + +### Accessing Additional Test Data + +Your evaluators can access any field from the testcase: + +```python +@ag.evaluator(slug="region_aware") +async def region_aware(country: str, region: str, outputs: str): + # You can access multiple fields from the testcase + # and use them in your evaluation logic + pass +``` + +### Returning Multiple Metrics + +Evaluators can return multiple scores: + +```python +@ag.evaluator(slug="detailed_eval") +async def detailed_eval(expected: str, outputs: str): + return { + "exact_match": 1.0 if outputs == expected else 0.0, + "length_diff": abs(len(outputs) - len(expected)), + "success": outputs == expected, + } +``` + +## Getting Help + +If you run into issues: +- Join our [Discord community](https://discord.gg/agenta) +- Open an issue on [GitHub](https://github.com/agenta-ai/agenta) diff --git a/docs/docs/evaluation/evaluation-from-sdk/02-managing-testsets.mdx b/docs/docs/evaluation/evaluation-from-sdk/02-managing-testsets.mdx new file mode 100644 index 0000000000..db8b6239fe --- /dev/null +++ b/docs/docs/evaluation/evaluation-from-sdk/02-managing-testsets.mdx @@ -0,0 +1,279 @@ +--- +title: "Managing Testsets" +sidebar_label: "Managing Testsets" +description: "Learn how to create, list, and retrieve testsets using the Agenta SDK" +sidebar_position: 2 +--- + +import GoogleColabButton from "@site/src/components/GoogleColabButton"; +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; + +This guide covers how to create, list, and retrieve testsets using the Agenta SDK for evaluation purposes. 
+ + + Open in Google Colaboratory + + +## Creating a Testset + +Use `ag.testsets.acreate()` to create a new testset with data: + + + + +```python +import agenta as ag + +# Create a testset with simple data +testset = await ag.testsets.acreate( + data=[ + {"country": "Germany", "capital": "Berlin"}, + {"country": "France", "capital": "Paris"}, + {"country": "Spain", "capital": "Madrid"} + ], + name="Country Capitals", +) + +print(f"Created testset with ID: {testset.id}") +print(f"Name: {testset.name}") +print(f"Slug: {testset.slug}") +``` + + + + +**Parameters:** +- `data`: A list of dictionaries containing your test data. Each dictionary represents one testcase. +- `name`: The name of your testset. + +**Returns:** A `TestsetRevision` object containing: +- `id`: The UUID of the created testset +- `name`: The testset name +- `slug`: The testset slug +- `data`: The test data (with `testcases` structure) + +**Sample Output:** + +```python +{ + "id": "01963413-3d39-7650-80ce-3ad5d688da6c", + "name": "Country Capitals", + "slug": "3ad5d688da6c", + "data": { + "testcases": [ + {"data": {"country": "Germany", "capital": "Berlin"}}, + {"data": {"country": "France", "capital": "Paris"}}, + {"data": {"country": "Spain", "capital": "Madrid"}} + ] + } +} +``` + +:::tip +The `data` parameter accepts a simple list of dictionaries. The SDK automatically converts this to the structured `TestsetRevisionData` format internally. +::: + +## Upserting a Testset + +Use `ag.testsets.aupsert()` to create a new testset or update an existing one if it already exists: + + + + +```python +import agenta as ag + +# Create or update a testset +testset = await ag.testsets.aupsert( + name="Country Capitals", + data=[ + {"country": "Germany", "capital": "Berlin"}, + {"country": "France", "capital": "Paris"}, + {"country": "Spain", "capital": "Madrid"}, + {"country": "Italy", "capital": "Rome"}, + ], +) + +print(f"Upserted testset with ID: {testset.id}") +``` + + + + +**Parameters:** +- `name` (required): The name of your testset. Used to find existing testset. +- `data` (required): A list of dictionaries containing your test data. +- `testset_id` (optional): If provided, updates the testset with this specific ID. + +**Returns:** A `TestsetRevision` object with the created or updated testset. + +:::tip When to use upsert vs create +Use `aupsert()` when you want to update an existing testset with the same name, or create it if it doesn't exist. This is useful in CI/CD pipelines where you want to keep testsets synchronized. Use `acreate()` when you explicitly want to create a new testset every time. +::: + +## Listing Testsets + +To list all testsets in your project, use `ag.testsets.alist()`: + + + + +```python +import agenta as ag + +# List all testsets +testsets = await ag.testsets.alist() + +print(f"Found {len(testsets)} testsets:") +for testset in testsets: + print(f" - {testset.name} (ID: {testset.id})") +``` + + + + +**Parameters:** None required. 
+ +**Returns:** A list of `TestsetRevision` objects, each containing: +- `id`: The testset UUID +- `name`: The testset name +- `slug`: The testset slug +- Additional metadata fields + +**Sample Output:** + +```python +[ + { + "id": "01963413-3d39-7650-80ce-3ad5d688da6c", + "name": "Country Capitals", + "slug": "country-capitals" + }, + { + "id": "01963520-4e4a-8761-91df-4be6e799eb7d", + "name": "Math Problems", + "slug": "math-problems" + } +] +``` + +## Retrieving a Testset by ID + +To retrieve a specific testset by its ID, use `ag.testsets.aretrieve()`: + + + + +```python +import agenta as ag + +# Retrieve a specific testset (using the testset_id from creation) +testset = await ag.testsets.aretrieve(testset_id=testset_id) + +if testset: + print(f"Retrieved testset: {testset.id}") + print(f"Testcases count: {len(testset.data.testcases) if testset.data and testset.data.testcases else 0}") +else: + print("Testset not found") +``` + + + + +**Parameters:** +- `testset_id`: The UUID of the testset to retrieve + +**Returns:** A `TestsetRevision` object (or `None` if not found) containing: +- `id`: The testset revision UUID +- `testset_id`: The parent testset UUID +- `slug`: The revision slug +- `version`: The revision version number +- `data`: The `TestsetRevisionData` with all testcases + +**Sample Output:** + +```python +{ + "id": "01963413-3d39-7650-80ce-3ad5d688da6c", + "testset_id": "01963413-3d39-7650-80ce-3ad5d688da6c", + "slug": "3ad5d688da6c", + "version": "1", + "data": { + "testcases": [ + {"data": {"country": "Germany", "capital": "Berlin"}}, + {"data": {"country": "France", "capital": "Paris"}}, + {"data": {"country": "Spain", "capital": "Madrid"}} + ] + } +} +``` + +:::info +Currently using the legacy testset API. When retrieving a testset, the function returns a `TestsetRevision` object with version "1". In the future, this will support the new versioning system where each update creates a new revision. +::: + +## Retrieving a Testset by Name + +While there's no dedicated function for this, you can easily find a testset by name by filtering the results from `ag.testsets.alist()`: + + + + +```python +import agenta as ag + +async def get_testset_by_name(name: str): + """Helper function to find a testset by name.""" + testsets = await ag.testsets.alist() + + if not testsets: + return None + + for testset in testsets: + if testset.name == name: + return testset + + return None + +# Usage +testset = await get_testset_by_name("Country Capitals") + +if testset: + print(f"Found testset: {testset.name} with ID: {testset.id}") +else: + print("Testset not found") +``` + + + + +:::tip Helper Pattern +This pattern shows how you can implement your own helper functions to filter and find testsets based on custom criteria. You can extend this to search by tags or other metadata fields. +::: + +## Working with Test Data + +Once you have a testset, you can access the testcases within it: + + + + +```python +import agenta as ag + +# Retrieve a testset +testset = await ag.testsets.aretrieve(testset_id=testset_id) + +# Access testcases +if testset and testset.data and testset.data.testcases: + for testcase in testset.data.testcases: + print(f"Testcase: {testcase.data}") + # Use testcase.data in your evaluation +``` + + + + +Each testcase contains a `data` field with the dictionary you provided during creation. You can use these testcases directly in your evaluations. 
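+
+As a convenience, you can flatten a retrieved testset back into the plain list-of-dictionaries shape you used at creation time, for example to reuse it elsewhere in your scripts. The helper below is a minimal sketch; the function name `testcases_to_dicts` is only an example and not part of the SDK:
+
+```python
+import agenta as ag
+
+def testcases_to_dicts(testset) -> list[dict]:
+    """Return the testcases of a TestsetRevision as plain dictionaries."""
+    if not testset or not testset.data or not testset.data.testcases:
+        return []
+    return [testcase.data for testcase in testset.data.testcases]
+
+# Usage
+testset = await ag.testsets.aretrieve(testset_id=testset_id)
+rows = testcases_to_dicts(testset)
+print(f"Recovered {len(rows)} testcases")
+```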
+ diff --git a/docs/docs/evaluation/evaluation-from-sdk/03-configuring-applications.mdx b/docs/docs/evaluation/evaluation-from-sdk/03-configuring-applications.mdx new file mode 100644 index 0000000000..92cb6cd32b --- /dev/null +++ b/docs/docs/evaluation/evaluation-from-sdk/03-configuring-applications.mdx @@ -0,0 +1,166 @@ +--- +title: "Configuring Applications" +sidebar_label: "Configuring Applications" +description: "Learn how to define and configure applications for evaluation with the Agenta SDK" +sidebar_position: 3 +--- + +# Configuring Applications + +Applications are the functions you want to evaluate. They receive inputs from your test data and return outputs that evaluators will check. + +## Basic Application Structure + +An application is any Python function decorated with `@ag.application`. The decorator tells Agenta this function should be evaluated. + +```python +import agenta as ag + +ag.init() + +@ag.application( + slug="my_app", + name="My Application", + description="Describes what this application does" +) +async def my_app(input_text: str): + # Your application logic here + result = process(input_text) + return result +``` + +The application decorator takes these parameters: + +- **slug** (required): A unique identifier for your application +- **name** (optional): A human-readable name shown in the UI +- **description** (optional): Explains what the application does + +## Understanding Application Inputs + +Applications receive inputs from your test cases. The function parameters must match field names in your test data. + +**Example:** + +```python +# Your test case +test_case = { + "country": "France", + "language": "French", + "capital": "Paris" +} + +# Your application receives these as parameters +@ag.application(slug="country_info") +async def country_info(country: str, language: str): + # country = "France" + # language = "French" + # Note: capital is not used by this application + return f"The capital of {country} is well known!" +``` + +You only need to declare parameters for the fields your application uses. Extra fields in the test case are ignored. +## Application Return Values + +Applications should return the output you want evaluators to check. The return value can be: + +- **String**: Text responses +- **Dictionary**: Structured data +- **List**: Multiple items +- **Number**: Numeric results +- **Any JSON-serializable value** + +### String Returns + +Most common for text-based applications: + +```python +@ag.application(slug="question_answerer") +async def question_answerer(question: str) -> str: + answer = generate_answer(question) + return answer # Simple string +``` + +### Dictionary Returns + +Useful for structured outputs: + +```python +@ag.application(slug="entity_extractor") +async def entity_extractor(text: str) -> dict: + return { + "entities": ["Paris", "France"], + "count": 2, + "confidence": 0.95 + } +``` + +### List Returns + +For multiple items: + +```python +@ag.application(slug="keyword_extractor") +async def keyword_extractor(text: str) -> list: + keywords = extract_keywords(text) + return keywords # ["keyword1", "keyword2", ...] 
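+
+# Note: `extract_keywords` above is a placeholder for your own logic; the
+# returned list is what evaluators receive as their `outputs` argument.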
+``` + +## Application Examples + +### Simple Lookup Application + +```python +@ag.application( + slug="capital_lookup", + name="Capital City Lookup", + description="Returns the capital city for a given country" +) +async def capital_lookup(country: str) -> str: + """Look up a country's capital city.""" + capitals = { + "France": "Paris", + "Germany": "Berlin", + "Spain": "Madrid", + "Italy": "Rome", + } + return capitals.get(country, "Unknown") +``` + +### LLM-Based Application + +```python +import openai + +@ag.application( + slug="question_answerer", + name="Question Answering System", + description="Answers questions using GPT-4" +) +async def question_answerer(question: str, context: str) -> str: + """Answer questions based on provided context.""" + client = openai.AsyncOpenAI() + + response = await client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": "Answer based on the context provided."}, + {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"} + ] + ) + + return response.choices[0].message.content +``` + +## Synchronous vs Asynchronous + +Applications can be either synchronous or asynchronous: + +### Asynchronous (Recommended) + + + +### Synchronous + + +## Working with Application Parameters + diff --git a/docs/docs/evaluation/evaluation-from-sdk/04-configuring-evaluators.mdx b/docs/docs/evaluation/evaluation-from-sdk/04-configuring-evaluators.mdx new file mode 100644 index 0000000000..7ff8e4fb34 --- /dev/null +++ b/docs/docs/evaluation/evaluation-from-sdk/04-configuring-evaluators.mdx @@ -0,0 +1,463 @@ +--- +title: "Configuring Evaluators" +sidebar_label: "Configuring Evaluators" +description: "Learn how to create custom evaluators and use built-in evaluators to check your application's output" +sidebar_position: 4 +--- + +# Configuring Evaluators + +Evaluators are functions that check if your application's output is correct. You can write your own custom evaluators or use Agenta's built-in evaluators. + +## Custom Evaluators + +Custom evaluators are Python functions decorated with `@ag.evaluator`. They receive inputs from your test data and the application's output, then return a dictionary with scores. + +### Basic Structure + +```python +import agenta as ag + +@ag.evaluator( + slug="my_evaluator", + name="My Evaluator", + description="Checks if the output meets my criteria" +) +async def my_evaluator(expected: str, outputs: str): + is_correct = outputs == expected + return { + "score": 1.0 if is_correct else 0.0, + "success": is_correct, + } +``` + +The evaluator decorator takes these parameters: + +- **slug** (required): A unique identifier for your evaluator +- **name** (optional): A human-readable name shown in the UI +- **description** (optional): Explains what the evaluator checks + +### Understanding Evaluator Inputs + +Evaluators receive two types of inputs: + +1. **Test case fields**: Any field from your test data +2. **Application output**: Always called `outputs` + +When you run an evaluation, Agenta passes both the test case data and what your application returned to the evaluator. 
+ +**Example:** + +```python +# Your test case +test_case = { + "question": "What is 2+2?", + "correct_answer": "4", + "difficulty": "easy" +} + +# Your evaluator can access any of these fields +@ag.evaluator(slug="math_checker") +async def math_checker( + correct_answer: str, # From test case + difficulty: str, # From test case + outputs: str # What the application returned +): + # Check if the application's output matches the correct answer + is_correct = outputs == correct_answer + + # You can use other fields in your logic + if difficulty == "easy": + return {"score": 1.0 if is_correct else 0.0} + else: + # More lenient scoring for hard questions + return {"score": 0.5 if is_correct else 0.0} +``` + +### Return Values + +Evaluators must return a dictionary. You can include any metrics you want, but these fields have special meaning: + +- **score**: A numeric value (typically 0.0 to 1.0) indicating quality +- **success**: A boolean flag indicating pass/fail + +```python +@ag.evaluator(slug="detailed_checker") +async def detailed_checker(expected: str, outputs: str): + return { + "score": 0.85, # Overall score + "success": True, # Did it pass? + "length_match": len(outputs) == len(expected), + "exact_match": outputs == expected, + "custom_metric": 42, + } +``` + +All values in the result dictionary are stored and displayed in the evaluation results. + +### Practical Examples + +#### Case-Insensitive Match + +```python +@ag.evaluator( + slug="case_insensitive_match", + name="Case Insensitive Match" +) +async def case_insensitive_match(expected: str, outputs: str): + match = outputs.lower() == expected.lower() + return { + "score": 1.0 if match else 0.0, + "success": match, + } +``` + +#### Length Check + +```python +@ag.evaluator( + slug="length_validator", + name="Length Validator" +) +async def length_validator(outputs: str): + """Check if output is within acceptable length.""" + length = len(outputs) + is_valid = 10 <= length <= 500 + + return { + "success": is_valid, + "length": length, + "score": 1.0 if is_valid else 0.0, + } +``` + +#### Contains Keywords + +```python +@ag.evaluator( + slug="keyword_checker", + name="Keyword Checker" +) +async def keyword_checker(keywords: list[str], outputs: str): + """Check if output contains required keywords.""" + found = [kw for kw in keywords if kw.lower() in outputs.lower()] + score = len(found) / len(keywords) if keywords else 0.0 + + return { + "score": score, + "success": score >= 0.8, + "found_keywords": found, + "missing_keywords": [kw for kw in keywords if kw not in found], + } +``` + +## Built-in Evaluators + +Agenta provides pre-built evaluators for common evaluation tasks. You import them from `agenta.sdk.workflows.builtin` and pass them directly to the `aevaluate()` function. + +### LLM-as-a-Judge + +The LLM-as-a-judge evaluator uses a language model to evaluate your application's output. This is useful when you need nuanced judgments that simple string matching cannot provide. + +```python +from agenta.sdk.workflows import builtin +from agenta.sdk.evaluations import aevaluate + +llm_evaluator = builtin.auto_ai_critique( + slug="quality_evaluator", + name="Quality Evaluator", + description="Uses an LLM to judge response quality", + correct_answer_key="expected_answer", + model="gpt-3.5-turbo", # or "gpt-4", "claude-3-sonnet", etc. 
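+    # Note: the judge model is assumed to be reachable with your configured
+    # provider credentials (e.g. OPENAI_API_KEY for OpenAI models).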
+ prompt_template=[ + { + "role": "system", + "content": "You are an expert evaluator of AI responses.", + }, + { + "role": "user", + "content": ( + "Expected answer: {{expected_answer}}\n" + "Actual answer: {{outputs}}\n\n" + "Rate the quality of the actual answer from 0.0 to 1.0.\n" + "Respond with ONLY a number, nothing else." + ), + }, + ], +) + +# Use it in evaluation +result = await aevaluate( + testsets=[testset.id], + applications=[my_app], + evaluators=[llm_evaluator], +) +``` + +**Parameters:** + +- **slug** (required): Unique identifier for the evaluator +- **prompt_template** (required): List of message dictionaries with `role` and `content` + - Use `{{field_name}}` placeholders that will be replaced with test case values + - `{{outputs}}` is always available for the application's output +- **correct_answer_key** (optional): Field name in test case containing the expected answer +- **model** (optional): Which LLM to use (default: "gpt-3.5-turbo") +- **name** (optional): Display name +- **description** (optional): Description of what this evaluator checks + +The prompt template uses curly brace syntax `{{variable}}` for placeholders. All fields from your test case are available, plus `{{outputs}}`. + +### String Matching Evaluators + +#### Exact Match + +Checks if the output exactly matches the expected answer. + +```python +from agenta.sdk.workflows import builtin +from agenta.sdk.evaluations import aevaluate + +exact_match = builtin.auto_exact_match( + correct_answer_key="expected" +) + +# Use in evaluation +result = await aevaluate( + testsets=[testset.id], + applications=[my_app], + evaluators=[exact_match], +) +``` + +**Parameters:** +- **correct_answer_key** (optional): Field name in test case with expected value (default: "correct_answer") + +**Returns:** +- `success`: True if output exactly matches expected value + +#### Starts With + +Checks if the output starts with a specific prefix. + +```python +prefix_check = builtin.auto_starts_with( + prefix="Answer:", + case_sensitive=True +) +``` + +**Parameters:** +- **prefix** (required): The string the output should start with +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if output starts with the prefix + +#### Ends With + +Checks if the output ends with a specific suffix. + +```python +suffix_check = builtin.auto_ends_with( + suffix="Thank you!", + case_sensitive=False +) +``` + +**Parameters:** +- **suffix** (required): The string the output should end with +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if output ends with the suffix + +#### Contains + +Checks if the output contains a substring. + +```python +contains_check = builtin.auto_contains( + substring="important keyword", + case_sensitive=False +) +``` + +**Parameters:** +- **substring** (required): The string to search for +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if output contains the substring + +#### Contains Any + +Checks if the output contains at least one of several substrings. 
+ +```python +any_check = builtin.auto_contains_any( + substrings=["hello", "hi", "greetings"], + case_sensitive=False +) +``` + +**Parameters:** +- **substrings** (required): List of strings to search for +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if output contains at least one substring + +#### Contains All + +Checks if the output contains all of several substrings. + +```python +all_check = builtin.auto_contains_all( + substrings=["name", "age", "email"], + case_sensitive=False +) +``` + +**Parameters:** +- **substrings** (required): List of strings that must all be present +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if output contains all substrings + +### Regex Evaluator + +Checks if the output matches a regular expression pattern. + +```python +regex_check = builtin.auto_regex_test( + regex_pattern=r"\d{3}-\d{3}-\d{4}", # Phone number pattern + regex_should_match=True, + case_sensitive=False +) +``` + +**Parameters:** +- **regex_pattern** (required): Regular expression pattern to test +- **regex_should_match** (optional): Whether pattern should match (default: True) +- **case_sensitive** (optional): Whether to match case (default: True) + +**Returns:** +- `success`: True if pattern match result equals `regex_should_match` + +### JSON Evaluators + +#### Contains JSON + +Checks if the output contains valid JSON. + +```python +json_check = builtin.auto_contains_json() +``` + +**Returns:** +- `success`: True if output contains parseable JSON + +#### JSON Field Match + +Checks if a specific field in JSON output matches the expected value. + +```python +field_check = builtin.field_match_test( + json_field="status", + correct_answer_key="expected_status" +) +``` + +**Parameters:** +- **json_field** (required): Name of field to extract from JSON output +- **correct_answer_key** (optional): Test case field with expected value (default: "correct_answer") + +**Returns:** +- `success`: True if extracted field matches expected value + +#### JSON Diff + +Compares JSON structures and calculates similarity score. + +```python +json_diff = builtin.auto_json_diff( + correct_answer_key="expected_json", + threshold=0.8, + compare_schema_only=False +) +``` + +**Parameters:** +- **correct_answer_key** (optional): Test case field with expected JSON (default: "correct_answer") +- **threshold** (optional): Minimum similarity score to pass (default: 0.5) +- **predict_keys** (optional): Whether to predict which keys to compare (default: False) +- **case_insensitive_keys** (optional): Whether to ignore case in key names (default: False) +- **compare_schema_only** (optional): Only compare structure, not values (default: False) + +**Returns:** +- `score`: Similarity score from 0.0 to 1.0 +- `success`: True if score meets threshold + +### Similarity Evaluators + +#### Levenshtein Distance + +Calculates edit distance between output and expected value. 
+
+```python
+levenshtein = builtin.auto_levenshtein_distance(
+    correct_answer_key="expected",
+    threshold=0.8,
+    case_sensitive=False
+)
+```
+
+**Parameters:**
+- **correct_answer_key** (optional): Test case field with expected value (default: "correct_answer")
+- **case_sensitive** (optional): Whether to match case (default: True)
+- **threshold** (optional): Minimum similarity to pass (default: 0.5)
+
+**Returns:**
+- `score`: Normalized similarity score from 0.0 to 1.0
+- `success`: True if score meets threshold
+
+#### Similarity Match
+
+Uses Python's SequenceMatcher to calculate similarity.
+
+```python
+similarity = builtin.auto_similarity_match(
+    correct_answer_key="expected",
+    threshold=0.75
+)
+```
+
+**Parameters:**
+- **correct_answer_key** (optional): Test case field with expected value (default: "correct_answer")
+- **threshold** (optional): Minimum similarity to pass (default: 0.5)
+
+**Returns:**
+- `score`: Similarity ratio from 0.0 to 1.0
+- `success`: True if score meets threshold
+
+#### Semantic Similarity
+
+Uses embeddings to measure semantic similarity.
+
+```python
+semantic = builtin.auto_semantic_similarity(
+    correct_answer_key="expected",
+    embedding_model="text-embedding-3-small",
+    threshold=0.8
+)
+```
+
+**Parameters:**
+- **correct_answer_key** (optional): Test case field with expected value (default: "correct_answer")
+- **embedding_model** (optional): OpenAI embedding model (default: "text-embedding-3-small")
+- **threshold** (optional): Minimum similarity to pass (default: 0.5)
+
+**Returns:**
+- `score`: Cosine similarity from 0.0 to 1.0
+- `success`: True if score meets threshold
diff --git a/docs/docs/evaluation/evaluation-from-sdk/05-running-evaluations.mdx b/docs/docs/evaluation/evaluation-from-sdk/05-running-evaluations.mdx
new file mode 100644
index 0000000000..5fea27788a
--- /dev/null
+++ b/docs/docs/evaluation/evaluation-from-sdk/05-running-evaluations.mdx
@@ -0,0 +1,154 @@
+---
+title: "Running Evaluations"
+sidebar_label: "Running Evaluations"
+description: "Learn how to run evaluations of LLM applications programmatically from the SDK"
+sidebar_position: 5
+---
+
+# Running Evaluations
+
+Once you have defined your testsets, applications, and evaluators, you can run evaluations using the `aevaluate()` function. This function executes your application on test data and scores the outputs using your evaluators.
+
+## Basic Usage
+
+The `aevaluate()` function requires three inputs:
+
+```python
+from agenta.sdk.evaluations import aevaluate
+
+result = await aevaluate(
+    testsets=[testset.id],
+    applications=[my_application],
+    evaluators=[my_evaluator],
+)
+```
+
+**Required Parameters:**
+
+- `testsets`: A list of testset IDs or testset data
+- `applications`: A list of application functions or IDs
+- `evaluators`: A list of evaluator functions or IDs
+
+The function runs each test case through your application and evaluates the output with all specified evaluators.
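+
+Because `aevaluate()` is a coroutine, it must run inside an event loop. In a notebook you can simply `await` it; in a plain Python script, a minimal sketch (reusing the placeholder `my_application` and `my_evaluator` from the snippet above) looks like this:
+
+```python
+import asyncio
+
+from agenta.sdk.evaluations import aevaluate
+
+async def main():
+    # Run the evaluation against an existing testset
+    result = await aevaluate(
+        testsets=[testset.id],
+        applications=[my_application],
+        evaluators=[my_evaluator],
+    )
+    return result
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```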
+ +## Passing Testsets + +You can provide testsets in two ways: + +**Using testset IDs:** + +```python +# Create a testset first +testset = await ag.testsets.acreate( + name="My Test Data", + data=[ + {"input": "Hello", "expected": "Hi"}, + {"input": "Goodbye", "expected": "Bye"}, + ], +) + +# Use the ID in aevaluate +result = await aevaluate( + testsets=[testset.id], + applications=[my_app], + evaluators=[my_eval], +) +``` + +**Using inline data:** + +```python +# Pass test data directly +result = await aevaluate( + testsets=[ + [ + {"input": "Hello", "expected": "Hi"}, + {"input": "Goodbye", "expected": "Bye"}, + ] + ], + applications=[my_app], + evaluators=[my_eval], +) +``` + +When you pass inline data, Agenta automatically creates a testset for you. + +## Running Multiple Evaluators + +You can run several evaluators on the same application in a single evaluation: + +```python +result = await aevaluate( + testsets=[testset.id], + applications=[my_application], + evaluators=[ + exact_match, + fuzzy_match, + llm_judge, + ], +) +``` + +## Adding Names and Descriptions + +You can add metadata to make your evaluations easier to identify: + +```python +result = await aevaluate( + name="Product Launch Evaluation", + description="Testing the new response format against our quality criteria", + testsets=[testset.id], + applications=[my_application], + evaluators=[my_evaluator], +) +``` + +The name and description appear in the Agenta UI and help you track evaluations over time. + + + +## Complete Example + +Here's a full evaluation workflow: + +```python +import agenta as ag +from agenta.sdk.evaluations import aevaluate + +# Initialize +ag.init() + +# Define application +@ag.application(slug="greeting_app") +async def greeting_app(message: str): + return f"Hello, {message}!" + +# Define evaluator +@ag.evaluator(slug="length_check") +async def length_check(outputs: str): + return { + "score": len(outputs), + "success": len(outputs) < 50, + } + +# Create testset +testset = await ag.testsets.acreate( + name="Greeting Tests", + data=[ + {"message": "Alice"}, + {"message": "Bob"}, + {"message": "Charlie"}, + ], +) + +# Run evaluation +result = await aevaluate( + name="Greeting App Test", + testsets=[testset.id], + applications=[greeting_app], + evaluators=[length_check], +) + +print(f"Evaluation complete! 
Run ID: {result['run'].id}") +``` + diff --git a/docs/docs/evaluation/evaluation-from-sdk/_category_.json b/docs/docs/evaluation/evaluation-from-sdk/_category_.json new file mode 100644 index 0000000000..cc211724c0 --- /dev/null +++ b/docs/docs/evaluation/evaluation-from-sdk/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Evaluation from SDK", + "position": 7, + "collapsible": true, + "collapsed": true +} \ No newline at end of file diff --git a/docs/docs/tutorials/cookbooks/01-capture-user-feedback.mdx b/docs/docs/tutorials/cookbooks/01-capture-user-feedback.mdx index 1633d97b31..789dda4e44 100644 --- a/docs/docs/tutorials/cookbooks/01-capture-user-feedback.mdx +++ b/docs/docs/tutorials/cookbooks/01-capture-user-feedback.mdx @@ -18,7 +18,7 @@ In this tutorial, we'll build a simple LLM application and learn how to capture - View this feedback in the Agenta UI - + Open in Google Colaboratory diff --git a/docs/docs/tutorials/cookbooks/02-observability_langchain.mdx b/docs/docs/tutorials/cookbooks/02-observability_langchain.mdx index 5a0bc75291..06d943840e 100644 --- a/docs/docs/tutorials/cookbooks/02-observability_langchain.mdx +++ b/docs/docs/tutorials/cookbooks/02-observability_langchain.mdx @@ -18,7 +18,7 @@ This guide shows you how to set up tracing for a RAG application in Langchain us Tracing allows us to debug effectively complex LLM applications. It allows us to view exact prompts sent and contexts retrieved. - + Open in Google Colaboratory diff --git a/examples/jupyter/evaluation/quick-start.ipynb b/examples/jupyter/evaluation/quick-start.ipynb new file mode 100644 index 0000000000..9329543cb4 --- /dev/null +++ b/examples/jupyter/evaluation/quick-start.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Agenta SDK Quick Start - Evaluations\n", + "\n", + "This notebook demonstrates how to:\n", + "1. Create a simple application that returns country capitals\n", + "2. Create evaluators to check if the application's output is correct\n", + "3. Run an evaluation to test your application\n", + "\n", + "The entire example takes less than 100 lines of code!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "First, install the Agenta SDK and set up your environment variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Install Agenta SDK\n", + "%pip install agenta -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Environment configured!\n" + ] + } + ], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "# Set your API credentials\n", + "if not os.getenv(\"AGENTA_API_KEY\"):\n", + " os.environ[\"AGENTA_API_KEY\"] = getpass(\"Enter your Agenta API key: \")\n", + "\n", + "if not os.getenv(\"AGENTA_HOST\"):\n", + " os.environ[\"AGENTA_HOST\"] = \"https://cloud.agenta.ai\" # Change for self-hosted\n", + "\n", + "# Set OpenAI API key (required for LLM-as-a-judge evaluator)\n", + "if not os.getenv(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")\n", + "\n", + "print(\"✅ Environment configured!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Agenta SDK\n", + "\n", + "Initialize the SDK to connect to the Agenta platform:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-11-12T13:22:23.599Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - SDK ver: 0.62.1 \u001b[38;5;245m[agenta.sdk.agenta_init]\u001b[0m \n", + "2025-11-12T13:22:23.600Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - API URL: https://cloud.agenta.ai/api \u001b[38;5;245m[agenta.sdk.agenta_init]\u001b[0m \n", + "2025-11-12T13:22:23.600Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - OLTP URL: https://cloud.agenta.ai/api/otlp/v1/traces \u001b[38;5;245m[agenta.sdk.tracing.tracing]\u001b[0m \n", + "✅ Agenta SDK initialized!\n" + ] + } + ], + "source": [ + "import agenta as ag\n", + "\n", + "ag.init()\n", + "\n", + "print(\"✅ Agenta SDK initialized!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Define Your Application\n", + "\n", + "An application is any function decorated with `@ag.application`. 
It receives inputs from test data and returns outputs.\n", + "\n", + "Let's create a simple application that returns country capitals:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Application defined!\n" + ] + } + ], + "source": [ + "@ag.application(\n", + " slug=\"capital_finder\",\n", + " name=\"Capital Finder\",\n", + " description=\"Returns the capital of a given country\"\n", + ")\n", + "async def capital_finder(country: str):\n", + " \"\"\"\n", + " A simple application that returns country capitals.\n", + " \n", + " Args:\n", + " country: The country name (from testcase)\n", + " \n", + " Returns:\n", + " The capital city name\n", + " \"\"\"\n", + " capitals = {\n", + " \"Germany\": \"Berlin\",\n", + " \"France\": \"Paris\",\n", + " \"Spain\": \"Madrid\",\n", + " \"Italy\": \"Rome\",\n", + " }\n", + " return capitals.get(country, \"Unknown\")\n", + "\n", + "print(\"✅ Application defined!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Create Custom Evaluators\n", + "\n", + "Evaluators check if your application's output is correct. They receive:\n", + "- Fields from your testcase (e.g., `capital`)\n", + "- The application's output (always called `outputs`)\n", + "\n", + "Let's create two evaluators:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Evaluators defined!\n" + ] + } + ], + "source": [ + "@ag.evaluator(\n", + " slug=\"exact_match\",\n", + " name=\"Exact Match Evaluator\",\n", + " description=\"Checks if the output exactly matches the expected answer\"\n", + ")\n", + "async def exact_match(capital: str, outputs: str):\n", + " \"\"\"\n", + " Evaluates if the application's output matches the expected answer.\n", + " \n", + " Args:\n", + " capital: The expected capital (from testcase)\n", + " outputs: What the application returned\n", + " \n", + " Returns:\n", + " Dictionary with score and success flag\n", + " \"\"\"\n", + " is_correct = outputs == capital\n", + " return {\n", + " \"score\": 1.0 if is_correct else 0.0,\n", + " \"success\": is_correct,\n", + " }\n", + "\n", + "\n", + "@ag.evaluator(\n", + " slug=\"case_insensitive_match\",\n", + " name=\"Case Insensitive Match\",\n", + " description=\"Checks if output matches ignoring case\"\n", + ")\n", + "async def case_insensitive_match(capital: str, outputs: str):\n", + " \"\"\"\n", + " Evaluates with case-insensitive comparison.\n", + " \"\"\"\n", + " is_correct = outputs.lower() == capital.lower()\n", + " return {\n", + " \"score\": 1.0 if is_correct else 0.0,\n", + " \"success\": is_correct,\n", + " }\n", + "\n", + "print(\"✅ Evaluators defined!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Use Built-in Evaluators\n", + "\n", + "Agenta provides built-in evaluators like LLM-as-a-judge. 
Let's create one:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ LLM judge evaluator created!\n" + ] + } + ], + "source": [ + "from agenta.sdk.workflows import builtin\n", + "\n", + "llm_judge = builtin.auto_ai_critique(\n", + " slug=\"llm_judge\",\n", + " name=\"LLM Judge Evaluator\",\n", + " description=\"Uses an LLM to judge if the answer is correct\",\n", + " correct_answer_key=\"capital\",\n", + " model=\"gpt-4o-mini\",\n", + " prompt_template=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a geography expert evaluating answers about world capitals.\",\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": (\n", + " \"Expected capital: {{capital}}\\n\"\n", + " \"Student's answer: {{outputs}}\\n\\n\"\n", + " \"Is the student's answer correct?\\n\"\n", + " \"Respond with ONLY a number from 0.0 (wrong) to 1.0 (correct).\\n\"\n", + " \"Nothing else - just the number.\"\n", + " ),\n", + " },\n", + " ],\n", + ")\n", + "\n", + "print(\"✅ LLM judge evaluator created!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Create Test Data\n", + "\n", + "Define test cases as a list of dictionaries:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Created 4 test cases\n" + ] + } + ], + "source": [ + "test_data = [\n", + " {\"country\": \"Germany\", \"capital\": \"Berlin\"},\n", + " {\"country\": \"France\", \"capital\": \"Paris\"},\n", + " {\"country\": \"Spain\", \"capital\": \"Madrid\"},\n", + " {\"country\": \"Italy\", \"capital\": \"Rome\"},\n", + "]\n", + "\n", + "print(f\"✅ Created {len(test_data)} test cases\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Run the Evaluation\n", + "\n", + "Now let's create a testset and run the evaluation!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📝 Creating testset...\n", + "✅ Testset created with ID: 019a783b-7894-7c80-a5ce-25005d745f5f\n", + " Contains 4 test cases\n", + "\n" + ] + } + ], + "source": [ + "from agenta.sdk.evaluations import aevaluate\n", + "\n", + "# Create a testset\n", + "print(\"📝 Creating testset...\")\n", + "testset = await ag.testsets.acreate(\n", + " name=\"Country Capitals Quick Start\",\n", + " data=test_data,\n", + ")\n", + "\n", + "if not testset or not testset.id:\n", + " print(\"❌ Failed to create testset\")\n", + "else:\n", + " print(f\"✅ Testset created with ID: {testset.id}\")\n", + " print(f\" Contains {len(test_data)} test cases\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🚀 Running evaluation...\n", + "\n", + "\n", + "────────────────────────────────────────────────────────────────────────────\n", + "Evaluation running...\n", + "────────────────────────────────────────────────────────────────────────────\n", + "• run_id=019a783b-a7dd-7d93-a537-bf8bbd9ea102\n", + "├─ • testset_id=019a783b-7894-7c80-a5ce-25005d745f5f\n", + "│ │ -------------------------------------------------------------\n", + "│ ├─ • testcase_id=7aa9d79f-7868-5305-89fc-98fdc4d2f257\n", + "│ │ ├─ • scenario_id=019a783b-a9ca-7881-909c-769d5d982ac6\n", + "│ │ │ ├─ • result_id=019a783b-aa39-7901-9268-0fefdf22fc0d (testcase)\n", + "│ │ │ ├─ • result_id=019a783b-aab6-7d71-ae94-7de9cf001ec6 (invocation)\n", + "│ │ │ ├─ • result_id=019a783b-ab2a-7de3-b8fa-119c40f9b00e (annotation)\n", + "│ │ │ ├─ • result_id=019a783b-ab9a-7f83-938d-1b7dc5e908dc (annotation)\n", + "│ │ │ └─ • result_id=019a783b-b120-7b42-b846-6a28fc2f610f (annotation)\n", + "│ │ └─ • metrics_id=019a783b-b689-7901-b9b6-2be90de568c6\n", + "│ │ -------------------------------------------------------------\n", + "│ ├─ • testcase_id=a952230e-a234-5c22-8b53-90db8a64adf7\n", + "│ │ ├─ • scenario_id=019a783b-b6cb-7a90-9e64-39518978ce5a\n", + "│ │ │ ├─ • result_id=019a783b-b707-7ff0-8c9f-14b463dfa020 (testcase)\n", + "│ │ │ ├─ • result_id=019a783b-b747-7013-afd0-67f1042420c6 (invocation)\n", + "│ │ │ ├─ • result_id=019a783b-b7b9-7e22-9b13-9f9193b9be66 (annotation)\n", + "│ │ │ ├─ • result_id=019a783b-b833-72f3-b571-f21120e65bac (annotation)\n", + "│ │ │ └─ • result_id=019a783b-bd01-7653-b790-95bfcb704032 (annotation)\n", + "│ │ └─ • metrics_id=019a783b-c24c-7442-8e40-638a6e52f951\n", + "│ │ -------------------------------------------------------------\n", + "│ ├─ • testcase_id=cd5879dc-d650-5164-8f44-e9dc031ff080\n", + "│ │ ├─ • scenario_id=019a783b-c28d-7ea2-8162-7cc9a61fddcf\n", + "│ │ │ ├─ • result_id=019a783b-c2cf-7632-92e8-271356cedcf6 (testcase)\n", + "│ │ │ ├─ • result_id=019a783b-c30f-7821-b4e6-8a3e4e34bac8 (invocation)\n", + "│ │ │ ├─ • result_id=019a783b-c380-75d0-b24a-fd68e8ac6e56 (annotation)\n", + "│ │ │ ├─ • result_id=019a783b-c3f2-7191-a643-9ce62704cb09 (annotation)\n", + "│ │ │ └─ • result_id=019a783b-c8e2-7dd2-b99f-9e30740117be (annotation)\n", + "│ │ └─ • metrics_id=019a783b-ce3b-7cf3-b4c9-6575f6a02fbe\n", + "│ │ -------------------------------------------------------------\n", + "│ └─ • testcase_id=e7c0a205-d422-5dde-b014-66e6e060936c\n", + "│ ├─ • scenario_id=019a783b-ce7b-7e80-b309-3a740e122252\n", + "│ │ ├─ • result_id=019a783b-ceb8-7632-a0f5-c12d5c414462 (testcase)\n", + "│ │ ├─ • 
result_id=019a783b-cef7-7083-9686-92baf3457d8d (invocation)\n", + "│ │ ├─ • result_id=019a783b-cf6a-78a3-ab30-52e5d98a23da (annotation)\n", + "│ │ ├─ • result_id=019a783b-cfd9-74e0-a5ff-d12a9cd904cb (annotation)\n", + "│ │ └─ • result_id=019a783b-d1ad-7a13-a294-c1568a7a1f1f (annotation)\n", + "│ └─ • metrics_id=019a783b-d6dd-7463-8fd0-7d1d5f3a02d1\n", + "│ -------------------------------------------------------------\n", + "└─ • metrics_id=019a783b-d7bd-7c30-8ea5-462088b33f96\n", + "────────────────────────────────────────────────────────────────────────────\n", + "Evaluation finished.\n", + "----------------------------------------------------------------------------\n", + "Evaluation URL: https://cloud.agenta.ai/w/019a5aa9-7bf0-78d3-a9f6-29dfaddf23d3/p/019a5aa9-7c04-7ad0-aafc-0a1388797bc0/evaluations/results/019a783b-a7dd-7d93-a537-bf8bbd9ea102\n", + "────────────────────────────────────────────────────────────────────────────\n", + "\n", + "\n", + "======================================================================\n", + "✅ Evaluation Complete!\n", + "======================================================================\n" + ] + } + ], + "source": [ + "# Run evaluation with all three evaluators\n", + "print(\"🚀 Running evaluation...\\n\")\n", + "\n", + "result = await aevaluate(\n", + " name=\"My First Eval\",\n", + " description=\"Test cases for capital city questions\",\n", + " testsets=[testset.id],\n", + " applications=[capital_finder],\n", + " evaluators=[\n", + " exact_match,\n", + " case_insensitive_match,\n", + " llm_judge,\n", + " ],\n", + ")\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"✅ Evaluation Complete!\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View Results\n", + "\n", + "The evaluation results are now available in the Agenta UI! You can:\n", + "\n", + "1. **View detailed results** - See how each test case performed\n", + "2. **Compare evaluators** - See which evaluators flagged which test cases\n", + "3. **Analyze metrics** - View aggregated scores and success rates\n", + "\n", + "You can also access results programmatically:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding the Data Flow\n", + "\n", + "When you run an evaluation, here's what happens:\n", + "\n", + "1. **Testcase data flows to the application**\n", + " - Input: `{\"country\": \"Germany\", \"capital\": \"Berlin\"}`\n", + " - Application receives: `country=\"Germany\"`\n", + " - Application returns: `\"Berlin\"`\n", + "\n", + "2. **Both testcase data and application output flow to evaluators**\n", + " - Evaluator receives: `capital=\"Berlin\"` (from testcase)\n", + " - Evaluator receives: `outputs=\"Berlin\"` (from application)\n", + " - Evaluator compares and returns: `{\"score\": 1.0, \"success\": True}`\n", + "\n", + "3. 
**Results are stored in Agenta**\n", + " - View in web interface\n", + " - Access programmatically" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Now that you've created your first evaluation, explore:\n", + "\n", + "- **[Configuring Evaluators](https://docs.agenta.ai/evaluation/evaluation-from-sdk/configuring-evaluators)** - Create custom scoring logic\n", + "- **[Managing Testsets](https://docs.agenta.ai/evaluation/evaluation-from-sdk/managing-testsets)** - Work with test data\n", + "- **[Running Evaluations](https://docs.agenta.ai/evaluation/evaluation-from-sdk/running-evaluations)** - Advanced evaluation patterns\n", + "\n", + "## Summary\n", + "\n", + "In this notebook, you learned how to:\n", + "\n", + "✅ Define an application with `@ag.application` \n", + "✅ Create custom evaluators with `@ag.evaluator` \n", + "✅ Use built-in evaluators like LLM-as-a-judge \n", + "✅ Create testsets with `ag.testsets.acreate()` \n", + "✅ Run evaluations with `aevaluate()` \n", + "✅ View results in the Agenta UI \n", + "\n", + "Happy evaluating! 🎉" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/jupyter/evaluation/testset-management.ipynb b/examples/jupyter/evaluation/testset-management.ipynb new file mode 100644 index 0000000000..8e1a9d6312 --- /dev/null +++ b/examples/jupyter/evaluation/testset-management.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b9164c8", + "metadata": {}, + "source": [ + "# Managing Testsets with Agenta SDK\n", + "\n", + "This notebook demonstrates how to create, list, and retrieve testsets using the Agenta SDK for evaluation purposes.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2430bced", + "metadata": {}, + "source": [ + "## Initialize Agenta\n", + "\n", + "First, let's set up the Agenta client with your API credentials:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d102b221", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-10-23T16:46:06.701Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - SDK version: 0.51.2 \u001b[38;5;245m[agenta.sdk.agenta_init]\u001b[0m \n", + "2025-10-23T16:46:06.702Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - Host: http://144.76.237.122 \u001b[38;5;245m[agenta.sdk.agenta_init]\u001b[0m \n", + "2025-10-23T16:46:06.702Z \u001b[38;5;70m[INFO.]\u001b[0m Agenta - OLTP URL: http://144.76.237.122/api/otlp/v1/traces \u001b[38;5;245m[agenta.sdk.tracing.tracing]\u001b[0m \n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"AGENTA_API_KEY\"] = \"\"\n", + "os.environ[\"AGENTA_HOST\"] = \"https://cloud.agenta.ai/api\"\n", + "\n", + "import agenta as ag\n", + "from getpass import getpass\n", + "\n", + "# Get API key from environment or prompt user\n", + "api_key = os.getenv(\"AGENTA_API_KEY\")\n", + "if not api_key:\n", + " os.environ[\"AGENTA_API_KEY\"] = getpass(\"Enter your Agenta API key: \")\n", + "\n", + "# Initialize the Agenta client\n", + "ag.init()" + ] + }, + { + "cell_type": "markdown", + "id": "60a6619e", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + 
"First, let's import the necessary functions from our entities module:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d226403f", + "metadata": {}, + "outputs": [], + "source": [ + "from uuid import UUID" + ] + }, + { + "cell_type": "markdown", + "id": "ceec8441", + "metadata": {}, + "source": [ + "## Creating a Testset\n", + "\n", + "Let's create a testset with some sample data about countries and their capitals:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2b89655", + "metadata": {}, + "outputs": [], + "source": "# Create a testset with simple data\ntestset = await ag.testsets.acreate(\n data=[\n {\"country\": \"Germany\", \"capital\": \"Berlin\"},\n {\"country\": \"France\", \"capital\": \"Paris\"},\n {\"country\": \"Spain\", \"capital\": \"Madrid\"},\n {\"country\": \"Italy\", \"capital\": \"Rome\"},\n {\"country\": \"Japan\", \"capital\": \"Tokyo\"},\n ],\n name=\"Country Capitals\",\n)\n\nprint(f\"✅ Created testset with ID: {testset.id}\")\nprint(f\" Name: {testset.name}\")\nprint(f\" Slug: {testset.slug}\")\n\n# Save the ID for later use\ntestset_id = testset.id" + }, + { + "cell_type": "markdown", + "id": "852d13a8", + "metadata": {}, + "source": "**Expected Output:**\n```\n✅ Created testset with ID: 01963413-3d39-7650-80ce-3ad5d688da6c\n Name: Country Capitals\n Slug: 3ad5d688da6c\n```\n\nThe `acreate` function returns a `TestsetRevision` object with the following fields:\n- `id`: Unique UUID for the testset\n- `name`: The name you provided\n- `slug`: A shortened identifier\n- `data`: The test data in a structured format" + }, + { + "cell_type": "markdown", + "id": "ac02ab05", + "metadata": {}, + "source": [ + "## Listing All Testsets\n", + "\n", + "Now let's retrieve all testsets in our project:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b52e8ae2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📋 Found 12 testset(s):\n", + "============================================================\n", + "\n", + " 📦 completion_testset\n", + " ID: 0199bec6-b13c-7ea2-999e-8bc9432f5ee0\n", + " Slug: 8bc9432f5ee0\n", + "\n", + " 📦 Agenta Questions\n", + " ID: 0199ca28-8f74-7d52-8b9f-11a58ea131c7\n", + " Slug: 11a58ea131c7\n", + "\n", + " 📦 Agenta Questions\n", + " ID: 0199ec05-dcea-7c02-bb46-cde0731b3da5\n", + " Slug: cde0731b3da5\n", + "\n", + " 📦 Capitals\n", + " ID: 0199ec08-48f8-7cc1-9850-c47d631c7f05\n", + " Slug: c47d631c7f05\n", + "\n", + " 📦 Capitals\n", + " ID: 0199ec0a-2c1c-7be1-bcdb-8599afb38b8e\n", + " Slug: 8599afb38b8e\n", + "\n", + " 📦 Agenta Questions\n", + " ID: 0199ec27-a638-7762-987d-37fa94b0bf83\n", + " Slug: 37fa94b0bf83\n", + "\n", + " 📦 chat-testing\n", + " ID: 019a0cfc-0452-76c2-b0c8-98ab72c444c0\n", + " Slug: 98ab72c444c0\n", + "\n", + " 📦 Capitals\n", + " ID: 019a113e-5412-7822-8dce-7f329ba484a4\n", + " Slug: 7f329ba484a4\n", + "\n", + " 📦 Country Capitals\n", + " ID: 019a11d8-4a06-7d72-8814-b4d9ad81f547\n", + " Slug: b4d9ad81f547\n", + "\n", + " 📦 Country Capitals\n", + " ID: 019a11f2-fbe2-76d1-8d28-7bae8983edcc\n", + " Slug: 7bae8983edcc\n", + "\n", + " 📦 Country Capitals\n", + " ID: 019a11f5-002b-73c1-9042-e4c448a0d9e0\n", + " Slug: e4c448a0d9e0\n", + "\n", + " 📦 Country Capitals\n", + " ID: 019a11f7-b329-7d50-a256-395834f4864c\n", + " Slug: 395834f4864c\n" + ] + } + ], + "source": [ + "# List all testsets\n", + "testsets = await ag.testsets.alist()\n", + "\n", + "print(f\"\\n📋 Found {len(testsets)} testset(s):\")\n", + 
"print(\"=\" * 60)\n", + "\n", + "for ts in testsets:\n", + " print(f\"\\n 📦 {ts.name}\")\n", + " print(f\" ID: {ts.id}\")\n", + " print(f\" Slug: {ts.slug}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1640d671", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "```\n", + "📋 Found 3 testset(s):\n", + "============================================================\n", + "\n", + " 📦 Country Capitals\n", + " ID: 01963413-3d39-7650-80ce-3ad5d688da6c\n", + " Slug: country-capitals\n", + "\n", + " 📦 Math Problems\n", + " ID: 01963520-4e4a-8761-91df-4be6e799eb7d\n", + " Slug: math-problems\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "0cdcd188", + "metadata": {}, + "source": [ + "## Retrieving a Testset by ID\n", + "\n", + "Let's retrieve a specific testset using its ID:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "752d9ad6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "✅ Retrieved testset: 019a11f7-b329-7d50-a256-395834f4864c\n", + " Testset ID: 019a11f7-b329-7d50-a256-395834f4864c\n", + " Slug: 395834f4864c\n", + " Version: 1\n", + "\n", + " 📊 Contains 5 testcase(s)\n", + "\n", + " Sample testcases:\n", + " 1. {'capital': 'Berlin', 'country': 'Germany'}\n", + " 2. {'capital': 'Paris', 'country': 'France'}\n", + " 3. {'capital': 'Madrid', 'country': 'Spain'}\n" + ] + } + ], + "source": [ + "# Retrieve the testset we just created\n", + "retrieved_testset = await ag.testsets.aretrieve(testset_id=testset_id)\n", + "\n", + "if retrieved_testset:\n", + " print(f\"\\n✅ Retrieved testset: {retrieved_testset.id}\")\n", + " print(f\" Testset ID: {retrieved_testset.testset_id}\")\n", + " print(f\" Slug: {retrieved_testset.slug}\")\n", + " print(f\" Version: {retrieved_testset.version}\")\n", + "\n", + " # Access the testcases\n", + " if retrieved_testset.data and retrieved_testset.data.testcases:\n", + " print(f\"\\n 📊 Contains {len(retrieved_testset.data.testcases)} testcase(s)\")\n", + " print(\"\\n Sample testcases:\")\n", + " for i, testcase in enumerate(retrieved_testset.data.testcases[:3], 1):\n", + " print(f\" {i}. {testcase.data}\")\n", + "else:\n", + " print(\"❌ Testset not found\")" + ] + }, + { + "cell_type": "markdown", + "id": "a78f38ba", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "```\n", + "✅ Retrieved testset: 01963413-3d39-7650-80ce-3ad5d688da6c\n", + " Testset ID: 01963413-3d39-7650-80ce-3ad5d688da6c\n", + " Slug: country-capitals\n", + " Version: 1\n", + "\n", + " 📊 Contains 5 testcase(s)\n", + "\n", + " Sample testcases:\n", + " 1. {'country': 'Germany', 'capital': 'Berlin'}\n", + " 2. {'country': 'France', 'capital': 'Paris'}\n", + " 3. 
{'country': 'Spain', 'capital': 'Madrid'}\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "19e8dc07", + "metadata": {}, + "source": [ + "## Retrieving a Testset by Name\n", + "\n", + "You can find a testset by name by filtering the results of `ag.testsets.alist()`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71ea54d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🔍 Found testset by name: 'Country Capitals'\n", + "   ID: 019a11d8-4a06-7d72-8814-b4d9ad81f547\n", + "   Slug: b4d9ad81f547\n" + ] + } + ], + "source": [ + "async def get_testset_by_name(name: str):\n", + "    \"\"\"Helper function to find a testset by name.\"\"\"\n", + "    testsets = await ag.testsets.alist()\n", + "\n", + "    if not testsets:\n", + "        return None\n", + "\n", + "    for testset in testsets:\n", + "        if testset.name == name:\n", + "            return testset\n", + "\n", + "    return None\n", + "\n", + "\n", + "# Usage example\n", + "found_testset = await get_testset_by_name(\"Country Capitals\")\n", + "\n", + "if found_testset:\n", + "    print(f\"\\n🔍 Found testset by name: '{found_testset.name}'\")\n", + "    print(f\"   ID: {found_testset.id}\")\n", + "    print(f\"   Slug: {found_testset.slug}\")\n", + "else:\n", + "    print(\"\\n❌ Testset not found\")" + ] + }, + { + "cell_type": "markdown", + "id": "f48579b0", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "```\n", + "🔍 Found testset by name: 'Country Capitals'\n", + "   ID: 01963413-3d39-7650-80ce-3ad5d688da6c\n", + "   Slug: 3ad5d688da6c\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "a8e8a8fb", + "metadata": {}, + "source": [ + "## Working with Test Data\n", + "\n", + "Once you have a testset, you can iterate through its testcases for evaluation:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fad427d9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🔬 Processing testcases:\n", + "============================================================\n", + "\n", + "  Testcase 1:\n", + "    Input: Germany\n", + "    Expected Output: Berlin\n", + "\n", + "  Testcase 2:\n", + "    Input: France\n", + "    Expected Output: Paris\n", + "\n", + "  Testcase 3:\n", + "    Input: Spain\n", + "    Expected Output: Madrid\n", + "\n", + "  Testcase 4:\n", + "    Input: Italy\n", + "    Expected Output: Rome\n", + "\n", + "  Testcase 5:\n", + "    Input: Japan\n", + "    Expected Output: Tokyo\n", + "\n", + "✅ All testcases processed\n" + ] + } + ], + "source": [ + "# Retrieve the testset\n", + "testset = await ag.testsets.aretrieve(testset_id=testset_id)\n", + "\n", + "if testset and testset.data and testset.data.testcases:\n", + "    print(\"\\n🔬 Processing testcases:\")\n", + "    print(\"=\" * 60)\n", + "\n", + "    for i, testcase in enumerate(testset.data.testcases, 1):\n", + "        country = testcase.data.get(\"country\")\n", + "        capital = testcase.data.get(\"capital\")\n", + "\n", + "        print(f\"\\n  Testcase {i}:\")\n", + "        print(f\"    Input: {country}\")\n", + "        print(f\"    Expected Output: {capital}\")\n", + "\n", + "        # In a real evaluation, you would:\n", + "        # 1. Pass the input to your LLM application\n", + "        # 2. Compare the output with the expected result\n", + "        # 3. 
Score the result using an evaluator\n", + "\n", + " print(\"\\n✅ All testcases processed\")" + ] + }, + { + "cell_type": "markdown", + "id": "e30ebfe2", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "```\n", + "🔬 Processing testcases:\n", + "============================================================\n", + "\n", + " Testcase 1:\n", + " Input: Germany\n", + " Expected Output: Berlin\n", + "\n", + " Testcase 2:\n", + " Input: France\n", + " Expected Output: Paris\n", + "\n", + " Testcase 3:\n", + " Input: Spain\n", + " Expected Output: Madrid\n", + "...\n", + "\n", + "✅ All testcases processed\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "ab7a8db7", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this notebook, we've covered:\n", + "\n", + "1. **Creating testsets** with `ag.testsets.acreate()` - Pass simple dictionaries of test data\n", + "2. **Listing testsets** with `ag.testsets.alist()` - Get all testsets in your project\n", + "3. **Retrieving by ID** with `ag.testsets.aretrieve()` - Get a specific testset with all its data\n", + "4. **Finding by name** - Use a helper pattern to filter testsets by name\n", + "5. **Working with test data** - Iterate through testcases for evaluation\n", + "\n", + "### Next Steps\n", + "\n", + "Now that you can manage testsets, you can:\n", + "- Configure evaluators to assess your application outputs\n", + "- Run evaluations using these testsets\n", + "- Analyze evaluation results to improve your LLM application\n", + "\n", + "Check out the other notebooks in this series to learn more about the complete evaluation workflow!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/examples/jupyter/evaluations_with_sdk.ipynb b/examples/jupyter/evaluations_with_sdk.ipynb deleted file mode 100644 index 74e3d9ff9e..0000000000 --- a/examples/jupyter/evaluations_with_sdk.ipynb +++ /dev/null @@ -1,279 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using evaluations with the SDK\n", - "In this cookbook we will show how to interact with evaluation in agenta programatically. Either using the SDK (or the raw API). \n", - "\n", - "We will do the following:\n", - "\n", - "- Create a testset\n", - "- Create and configure an evaluator\n", - "- Run an evaluation\n", - "- Retrieve the results of evaluations\n", - "\n", - "We assume that you have already created an LLM application and variants in agenta. \n", - "\n", - "\n", - "### Architectural Overview:\n", - "In this scenario, evaluations are executed on the Agenta backend. Specifically, Agenta invokes the LLM application for each row in the testset and subsequently processes the output using the designated evaluator. \n", - "This operation is managed through Celery tasks. The interactions with the LLM application are asynchronous, batched, and include retry mechanisms. 
Additionally, the batching configuration can be adjusted to avoid exceeding the rate limits imposed by the LLM provider.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install -U agenta" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration Setup\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Assuming an application has already been created through the user interface, you will need to obtain the application ID.\n", - "# In this example we will use the default template single_prompt which has the prompt \"Determine the capital of {country}\"\n", - "\n", - "# You can find the application ID in the URL. For example, in the URL https://cloud.agenta.ai/apps/666dde95962bbaffdb0072b5/playground?variant=app.default, the application ID is `666dde95962bbaffdb0072b5`.\n", - "from agenta.client.client import AgentaApi\n", - "\n", - "# Let's list the applications\n", - "client.apps.list_apps()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "app_id = \"667d8cfad1812781f7e375d9\"\n", - "\n", - "# You can create the API key under the settings page. If you are using the OSS version, you should keep this as an empty string\n", - "api_key = \"EUqJGOUu.xxxx\"\n", - "\n", - "# Host.\n", - "host = \"https://cloud.agenta.ai\"\n", - "\n", - "# Initialize the client\n", - "\n", - "client = AgentaApi(base_url=host + \"/api\", api_key=api_key)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a testset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agenta.client.types.new_testset import NewTestset\n", - "\n", - "csvdata = [\n", - " {\"country\": \"france\", \"capital\": \"Paris\"},\n", - " {\"country\": \"Germany\", \"capital\": \"paris\"},\n", - "]\n", - "\n", - "response = client.testsets.create_testset(\n", - " request=NewTestset(name=\"testset\", csvdata=csvdata)\n", - ")\n", - "testset_id = response.id\n", - "\n", - "# let's now update it\n", - "\n", - "csvdata = [\n", - " {\"country\": \"france\", \"capital\": \"Paris\"},\n", - " {\"country\": \"Germany\", \"capital\": \"Berlin\"},\n", - "]\n", - "\n", - "client.testsets.update_testset(\n", - " testset_id=testset_id, request=NewTestset(name=\"testset\", csvdata=csvdata)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create evaluators" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create an evaluator that performs an exact match comparison on the 'capital' column\n", - "# You can find the list of evaluator keys and evaluators and their configurations in https://github.com/Agenta-AI/agenta/blob/main/agenta-backend/agenta_backend/resources/evaluators/evaluators.py\n", - "response = client.evaluators.create_new_evaluator_config(\n", - " app_id=app_id,\n", - " name=\"capital_evaluator\",\n", - " evaluator_key=\"auto_exact_match\",\n", - " settings_values={\"correct_answer_key\": \"capital\"},\n", - ")\n", - "exact_match_eval_id = response.id\n", - "\n", - "code_snippet = \"\"\"\n", - "from typing import Dict\n", - "\n", - "def evaluate(\n", - " app_params: Dict[str, str],\n", - " inputs: Dict[str, str],\n", - " 
output: str, # output of the llm app\n", - " datapoint: Dict[str, str] # contains the testset row \n", - ") -> float:\n", - " if output and output[0].isupper():\n", - " return 1.0\n", - " else:\n", - " return 0.0\n", - "\"\"\"\n", - "\n", - "response = client.evaluators.create_new_evaluator_config(\n", - " app_id=app_id,\n", - " name=\"capital_letter_evaluator\",\n", - " evaluator_key=\"auto_custom_code_run\",\n", - " settings_values={\"code\": code_snippet},\n", - ")\n", - "letter_match_eval_id = response.id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get list of all evaluators\n", - "client.evaluators.get_evaluator_configs(app_id=app_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run an evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "response = client.apps.list_app_variants(app_id=app_id)\n", - "print(response)\n", - "myvariant_id = response[0].variant_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run an evaluation\n", - "from agenta.client.types.llm_run_rate_limit import LlmRunRateLimit\n", - "\n", - "response = client.evaluations.create_evaluation(\n", - " app_id=app_id,\n", - " variant_ids=[myvariant_id],\n", - " testset_id=testsetid,\n", - " evaluators_configs=[exact_match_eval_id, letter_match_eval_id],\n", - " rate_limit=LlmRunRateLimit(\n", - " batch_size=10, # number of rows to call in parallel\n", - " max_retries=3, # max number of time to retry a failed llm call\n", - " retry_delay=2, # delay before retrying a failed llm call\n", - " delay_between_batches=5, # delay between batches\n", - " ),\n", - ")\n", - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# check the status\n", - "client.evaluations.fetch_evaluation_status(\"667d98fbd1812781f7e3761a\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fetch the overall results\n", - "response = client.evaluations.fetch_evaluation_results(\"667d98fbd1812781f7e3761a\")\n", - "\n", - "results = [\n", - " (evaluator[\"evaluator_config\"][\"name\"], evaluator[\"result\"])\n", - " for evaluator in response[\"results\"]\n", - "]\n", - "# End of Selection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fetch the detailed results\n", - "client.evaluations.fetch_evaluation_scenarios(\n", - " evaluations_ids=\"667d98fbd1812781f7e3761a\"\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/jupyter/capture_user_feedback.ipynb b/examples/jupyter/observability/capture_user_feedback.ipynb similarity index 100% rename from examples/jupyter/capture_user_feedback.ipynb rename to examples/jupyter/observability/capture_user_feedback.ipynb diff --git a/examples/jupyter/observability_langchain.ipynb b/examples/jupyter/observability/observability_langchain.ipynb similarity index 
100% rename from examples/jupyter/observability_langchain.ipynb rename to examples/jupyter/observability/observability_langchain.ipynb