AutoEvaluator is a powerful Python library that accelerates LLM output quality control through automated evaluation. Using LLMs to evaluate LLMs, it provides a simple, transparent, and developer-friendly API to identify True Positives (TP), False Positives (FP), and False Negatives (FN) in generated content against ground truth.
- Automated Evaluation: Compare LLM outputs against ground truth with precision
- Multi-Provider Support: Works with AWS Bedrock, OpenAI, Anthropic, and Google Gemini
- Comprehensive Metrics: Automatically calculates Precision, Recall, and F1 Score
- Async-First Design: Built for high-performance concurrent evaluations
- Structured Outputs: Leverages Instructor for type-safe, validated responses
- Sentence-Level Granularity: Evaluates claims at the sentence level for detailed insights
- Python 3.9 or higher
- An API key for at least one supported LLM provider
```bash
pip install autoevaluator
```

Or install from source:

```bash
git clone https://github.com/yourusername/autoevaluator.git
cd autoevaluator
pip install -e .
```

Basic usage:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def main():
    # Setup client for your preferred provider
    client = get_instructor_client(provider="openai", model="gpt-4o-mini")

    # Define the claim to evaluate
    claim = "Feynman was born in 1918 in Malaysia"

    # Define the ground truth
    ground_truth = "Feynman was born in 1918 in America."

    # Evaluate the claim
    result = await evaluate(
        claim=claim,
        ground_truth=ground_truth,
        client=client,
        model_name="gpt-4o-mini"
    )

    print(result)
# Run the async function
asyncio.run(main())
```

Output:

```python
{
    'TP': ['Feynman was born in 1918.'],
    'FP': ['Feynman was born in Malaysia.'],
    'FN': ['Feynman was born in America.'],
    'precision': 0.5,
    'recall': 0.5,
    'f1_score': 0.5
}
```

AutoEvaluator supports multiple LLM providers out of the box:
| Provider | Models | Environment Variables |
|---|---|---|
| AWS Bedrock | Claude Sonnet 4.5 | AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION |
| OpenAI | GPT-4o, GPT-4o-mini, etc. | OPENAI_API_KEY |
| Anthropic | Claude Sonnet 4, etc. | ANTHROPIC_API_KEY |
| Google Gemini | Gemini 2.0 Flash, etc. | GOOGLE_API_KEY |
Create a `.env` file in your project root:

```bash
# OpenAI
OPENAI_API_KEY=your_openai_api_key
# AWS Bedrock
AWS_ACCESS_KEY_ID=your_aws_access_key
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
AWS_REGION=ap-southeast-1
# Anthropic
ANTHROPIC_API_KEY=your_anthropic_api_key
# Google Gemini
GOOGLE_API_KEY=your_google_api_key
```

Or set them programmatically:

```python
import os
# Set environment variables programmatically
os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
os.environ["AWS_ACCESS_KEY_ID"] = "your_aws_access_key"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your_aws_secret_key"
```

Evaluating with OpenAI:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def evaluate_with_openai():
    client = get_instructor_client(provider="openai", model="gpt-4o-mini")

    claim = "The Earth is flat and the moon landing was in 1969."
    ground_truth = "The Earth is round. The moon landing was in 1969."

    result = await evaluate(claim, ground_truth, client=client, model_name="gpt-4o-mini")

    print(f"True Positives: {result['TP']}")
    print(f"False Positives: {result['FP']}")
    print(f"False Negatives: {result['FN']}")
    print(f"Precision: {result['precision']:.2f}")
    print(f"Recall: {result['recall']:.2f}")
    print(f"F1 Score: {result['f1_score']:.2f}")
asyncio.run(evaluate_with_openai())
```

Evaluating with AWS Bedrock:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def evaluate_with_bedrock():
    client = get_instructor_client(provider="bedrock")

    claim = "Python was created by Guido van Rossum in 1991."
    ground_truth = "Python was created by Guido van Rossum in 1991."

    result = await evaluate(claim, ground_truth, client=client, model_name="bedrock-claude")
    return result
result = asyncio.run(evaluate_with_bedrock())
print(f"Perfect match! F1 Score: {result['f1_score']}")
```

Evaluating with Anthropic:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def evaluate_with_anthropic():
    client = get_instructor_client(
        provider="anthropic",
        model="claude-sonnet-4-20250514"
    )

    claim = "Water boils at 100°C at sea level."
    ground_truth = "Water boils at 100°C at sea level."

    result = await evaluate(claim, ground_truth, client=client, model_name="claude-sonnet-4-20250514")
    return result
result = asyncio.run(evaluate_with_anthropic())
```

Evaluating with Google Gemini:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def evaluate_with_gemini():
    client = get_instructor_client(
        provider="gemini",
        model="gemini-2.0-flash-exp"
    )

    claim = "The speed of light is approximately 300,000 km/s in a vacuum."
    ground_truth = "The speed of light is approximately 300,000 kilometers per second in a vacuum."

    result = await evaluate(
        claim=claim,
        ground_truth=ground_truth,
        client=client,
        model_name="gemini-2.0-flash-exp"
    )

    print(f"True Positives: {result['TP']}")
    print(f"False Positives: {result['FP']}")
    print(f"False Negatives: {result['FN']}")
    print(f"F1 Score: {result['f1_score']:.2f}")
    return result
result = asyncio.run(evaluate_with_gemini())
```

Running batch evaluations concurrently:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def batch_evaluate():
    client = get_instructor_client(provider="openai", model="gpt-4o-mini")

    test_cases = [
        {
            "claim": "Einstein developed the theory of relativity.",
            "ground_truth": "Einstein developed the theory of relativity."
        },
        {
            "claim": "The capital of France is London.",
            "ground_truth": "The capital of France is Paris."
        },
        {
            "claim": "Water is composed of hydrogen and oxygen.",
            "ground_truth": "Water is composed of hydrogen and oxygen."
        }
    ]

    tasks = [
        evaluate(tc["claim"], tc["ground_truth"], client=client, model_name="gpt-4o-mini")
        for tc in test_cases
    ]
    results = await asyncio.gather(*tasks)

    for i, result in enumerate(results, 1):
        print(f"\n--- Test Case {i} ---")
        print(f"F1 Score: {result['f1_score']:.2f}")
        print(f"Precision: {result['precision']:.2f}")
        print(f"Recall: {result['recall']:.2f}")
asyncio.run(batch_evaluate())
```

`evaluate()`: Evaluates a claim against ground truth and returns detailed metrics.

```python
async def evaluate(
    claim: str,
    ground_truth: str,
    client: instructor.AsyncInstructor,
    model_name: str = "gpt-4o-mini"
) -> Dict[str, Any]
```

Parameters:
- `claim` (str): The text to be evaluated
- `ground_truth` (str): The reference text to compare against
- `client` (instructor.AsyncInstructor): Instructor-wrapped async client
- `model_name` (str): Model identifier to use
Returns:
Dictionary containing:
- `TP` (List[str]): List of true positive sentences
- `FP` (List[str]): List of false positive sentences
- `FN` (List[str]): List of false negative sentences
- `precision` (float): Precision score (0.0 to 1.0)
- `recall` (float): Recall score (0.0 to 1.0)
- `f1_score` (float): F1 score (0.0 to 1.0)
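The scores are the standard precision/recall/F1 over the returned sentence lists. A minimal sketch of the arithmetic, assuming the library derives them from the list lengths (the counts below match the quick-start output above: 1 TP, 1 FP, 1 FN):

```python
# Hypothetical re-computation from the TP/FP/FN counts in the quick-start output.
tp, fp, fn = 1, 1, 1

precision = tp / (tp + fp) if (tp + fp) else 0.0   # 1 / 2 = 0.5
recall = tp / (tp + fn) if (tp + fn) else 0.0      # 1 / 2 = 0.5
f1_score = (
    2 * precision * recall / (precision + recall)
    if (precision + recall) else 0.0
)                                                   # 0.5
print(precision, recall, f1_score)
```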
`get_instructor_client()`: Creates an Instructor-wrapped client for the specified LLM provider.

```python
def get_instructor_client(
    provider: Literal["bedrock", "openai", "anthropic", "gemini"] = "bedrock",
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    mode: instructor.Mode = instructor.Mode.JSON,
    **kwargs
) -> instructor.AsyncInstructor
```

Parameters:
- `provider` (str): LLM provider to use ("bedrock", "openai", "anthropic", "gemini")
- `model` (Optional[str]): Model name (uses provider default if None)
- `api_key` (Optional[str]): API key (falls back to environment variables)
- `mode` (instructor.Mode): Instructor parsing mode
- `**kwargs`: Additional provider-specific arguments
Returns:
An Instructor-wrapped async client ready for use.
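For example, based on the signature above, the API key and parsing mode can be passed explicitly instead of relying on environment variables. A sketch (the key value is a placeholder; exact provider-specific kwargs depend on your setup):

```python
import instructor
from autoevaluator import get_instructor_client

# Sketch: explicit API key and parsing mode, per the signature above.
client = get_instructor_client(
    provider="openai",
    model="gpt-4o-mini",
    api_key="sk-...",           # placeholder; falls back to OPENAI_API_KEY if omitted
    mode=instructor.Mode.JSON,  # the documented default parsing mode
)
```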
`text_simplifier()`: Breaks down complex text into simple, single-clause sentences.

```python
async def text_simplifier(
    text: str,
    model_name: str,
    client: instructor.AsyncInstructor
) -> TextSimplify
```

AutoEvaluator uses a sophisticated multi-step process to evaluate claims:
- Text Simplification: Complex sentences are broken down into simple, atomic claims
- Question Generation: Each simplified sentence is converted into a fact-checking question
- Bidirectional Verification: Questions are checked against both the claim and ground truth
- Classification: Sentences are classified as TP, FP, or FN based on verification results
- Metrics Calculation: Precision, Recall, and F1 scores are computed from the classifications
```
Input Claim & Ground Truth
            ↓
Text Simplifier (breaks into atomic sentences)
            ↓
Question Generator (creates fact-check questions)
            ↓
Question Checker (verifies against ground truth)
            ↓
Classification (TP/FP/FN assignment)
            ↓
Metrics Calculation (Precision, Recall, F1)
            ↓
Structured Output
```
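Conceptually, the classification step is a two-way comparison between the simplified claim sentences and the simplified ground-truth sentences. The following is only an illustrative sketch of that rule, not the library's internal, LLM-driven implementation; `supported_by` stands in for the question-based verification:

```python
from typing import Callable, Dict, List

def classify(
    claim_sentences: List[str],
    truth_sentences: List[str],
    supported_by: Callable[[str, List[str]], bool],
) -> Dict[str, List[str]]:
    """Illustrative TP/FP/FN assignment (sketch only)."""
    # Claim sentences backed by the ground truth are true positives;
    # unsupported claim sentences are false positives.
    tp = [s for s in claim_sentences if supported_by(s, truth_sentences)]
    fp = [s for s in claim_sentences if not supported_by(s, truth_sentences)]
    # Ground-truth sentences not covered by the claim are false negatives.
    fn = [s for s in truth_sentences if not supported_by(s, claim_sentences)]
    return {"TP": tp, "FP": fp, "FN": fn}
```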
Using the text simplifier directly:

```python
import asyncio
from autoevaluator import text_simplifier, get_instructor_client
async def simplify_text():
    client = get_instructor_client(provider="openai")

    complex_text = """Although the weather was bad and it was raining heavily,
    we decided to go hiking because we had planned it for weeks."""

    result = await text_simplifier(
        text=complex_text,
        model_name="gpt-4o-mini",
        client=client
    )

    print("Simplified sentences:")
    for sentence in result.simplified_sentences:
        print(f"- {sentence}")
asyncio.run(simplify_text())
```

You can also create provider-specific clients directly:

```python
from autoevaluator.client import (
    get_openai_instructor_client,
    get_bedrock_instructor_client,
    get_anthropic_instructor_client,
    get_gemini_instructor_client
)
# OpenAI
openai_client = get_openai_instructor_client(model="gpt-4o")
# Bedrock
bedrock_client = get_bedrock_instructor_client()
# Anthropic
anthropic_client = get_anthropic_instructor_client()
# Gemini
gemini_client = get_gemini_instructor_client(model="gemini-2.0-flash")
```

Handling errors:

```python
import asyncio
from dotenv import load_dotenv
load_dotenv() # Load env variables BEFORE importing autoevaluator
from autoevaluator import evaluate, get_instructor_client
async def safe_evaluate():
    try:
        client = get_instructor_client(provider="openai")
        result = await evaluate(
            claim="Some claim",
            ground_truth="Some truth",
            client=client,
            model_name="gpt-4o-mini"
        )
        return result
    except ValueError as e:
        print(f"Configuration error: {e}")
    except Exception as e:
        print(f"Evaluation error: {e}")
asyncio.run(safe_evaluate())
```

Best practices:

- Async by Default: All operations are asynchronous for better performance
- Batch Processing: Use `asyncio.gather()` for concurrent evaluations
- Rate Limiting: Be mindful of provider rate limits when running batch evaluations; one way to cap concurrency is shown in the sketch after this list
- Caching: Consider caching results for repeated evaluations
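A minimal sketch of rate-limited batching that caps in-flight requests with `asyncio.Semaphore`. The `bounded_batch` helper and the limit of 5 are illustrative, not part of the library:

```python
import asyncio
from autoevaluator import evaluate, get_instructor_client

async def bounded_batch(test_cases, max_concurrency: int = 5):
    # Cap the number of concurrent evaluations to stay under provider rate limits.
    semaphore = asyncio.Semaphore(max_concurrency)
    client = get_instructor_client(provider="openai", model="gpt-4o-mini")

    async def run_one(tc):
        async with semaphore:
            return await evaluate(
                tc["claim"], tc["ground_truth"],
                client=client, model_name="gpt-4o-mini"
            )

    return await asyncio.gather(*(run_one(tc) for tc in test_cases))
```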
Contributions are welcome! Please follow these steps:
- Fork the repository
- Create a feature branch (`git checkout -b feature/amazing-feature`)
- Commit your changes (`git commit -m 'Add amazing feature'`)
- Push to the branch (`git push origin feature/amazing-feature`)
- Open a Pull Request
This project is licensed under the MIT License. See the LICENSE file for details.
- Built with Instructor for structured outputs
- Supports multiple LLM providers through unified interfaces
- Inspired by the need for automated, reliable LLM evaluation
Darveen Vijayan
- LinkedIn: darveenvijayan
- Twitter: @DarveenVijayan
- Medium: LLMs: A Calculator for Words
- Multi-provider support (OpenAI, Bedrock, Anthropic, Gemini)
- Async-first architecture
- Improved text simplification
- Enhanced error handling
Made with ❤️ by Darveen Vijayan