14 changes: 14 additions & 0 deletions pyproject.toml
@@ -21,6 +21,11 @@ dependencies = [
"langfuse==3.10.6",
"instructor>=1.13.0",
"jsonschema>=4.17.3",
"ragas>=0.4.3",
"litellm>=1.81.12",
Comment on lines +24 to +25

⚠️ Potential issue | 🟠 Major

Evaluation-only dependencies added to the main project's runtime dependencies.

ragas and litellm are only used by the rag_eval/ evaluation package, not by the MCP server itself. Adding them as top-level runtime dependencies unnecessarily increases the install footprint for all deployments (including production Lambda/container).

Move these to an optional dependency group:

Suggested fix
     "instructor>=1.13.0",
     "jsonschema>=4.17.3",
-    "ragas>=0.4.3",
-    "litellm>=1.81.12",
 ]

 [project.optional-dependencies]
+eval = [
+    "ragas>=0.4.3",
+    "litellm>=1.81.12",
+]
 dev = [
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` around lines 24 - 25, The dependencies "ragas" and "litellm"
are evaluation-only and should be removed from the top-level runtime
dependencies in pyproject.toml; instead add them under an optional dependency
group (e.g., project.optional-dependencies or tool.poetry.extras) as a
"rag_eval" or "evaluation" extra so only users who need the rag_eval/ package
install them; update references to the dependency names "ragas" and "litellm"
accordingly and ensure the main MCP runtime dependency list no longer contains
these two entries.
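With the dependencies moved into an extra as suggested, consumers would opt in at install time. A sketch of the resulting install commands (assuming the extra is named `eval` as in the suggested fix):

```shell
# Runtime dependencies only (no ragas/litellm)
uv sync

# Include the evaluation extra
uv sync --extra eval

# Equivalent with pip
pip install -e ".[eval]"
```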

"nest-asyncio>=1.6.0",
"boto3>=1.28.0",
"mlflow>=3.1.4",
]

[project.optional-dependencies]
@@ -128,3 +133,12 @@ quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"

[dependency-groups]
dev = [
"pytest>=8.4.2",
"pytest-asyncio>=1.2.0",
"pytest-cov>=7.0.0",
"pytest-mock>=3.14.0",
"responses>=0.25.3",
]
94 changes: 94 additions & 0 deletions rag_eval/README.md
@@ -0,0 +1,94 @@
# RAG Evaluation

Evaluate a RAG (Retrieval-Augmented Generation) system with custom metrics.

## Quick Start

### 1. Set Your API Key

Choose your LLM provider:

```bash
# OpenAI (default)
export OPENAI_API_KEY="your-openai-key"

# Or use Anthropic Claude
export ANTHROPIC_API_KEY="your-anthropic-key"

# Or use Google Gemini
export GOOGLE_API_KEY="your-google-key"
```
Comment on lines +7 to +20

⚠️ Potential issue | 🟡 Minor

Quick Start API key instructions are misleading — code uses AWS Bedrock, not OpenAI/Anthropic/Google.

The actual implementation in ragas_utils.py uses bedrock/amazon.nova-pro-v1:0 via litellm, not OpenAI or Anthropic directly. This section should document AWS credential setup (e.g., AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, or AWS profile configuration) instead.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rag_eval/README.md` around lines 7 - 20, The quick-start API key instructions
are incorrect because the code (ragas_utils.py) uses AWS Bedrock via litellm
with model "bedrock/amazon.nova-pro-v1:0" rather than OpenAI/Anthropic/Google;
update the README section to instruct users to configure AWS credentials (e.g.,
AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY or use an AWS CLI
profile) and mention any litellm-specific env vars or credential providers
required for Bedrock access, and replace the OpenAI/ANTHROPIC/GOOGLE examples
with Bedrock-specific setup and a short note referencing ragas_utils.py and the
Bedrock model string so readers know which provider is actually used.
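As a sketch of what the corrected Quick Start could show instead (key values and region are placeholders; any standard AWS credential source, such as a configured CLI profile, would work equally well):

```shell
# Credentials for Bedrock, picked up by litellm/boto3
export AWS_ACCESS_KEY_ID="your-access-key"
export AWS_SECRET_ACCESS_KEY="your-secret-key"
export AWS_DEFAULT_REGION="us-east-1"  # a region where Bedrock access is enabled
```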


### 2. Install Dependencies

Using `uv` (recommended):

```bash
uv sync
```

Or using `pip`:

```bash
pip install -e .
```

### 3. Run the Evaluation

Using `uv`:

```bash
uv run python evals.py
```

Or using `pip`:

```bash
python evals.py
```

## Project Structure

```
rag_eval/
├── README.md           # This file
├── pyproject.toml      # Project configuration
├── rag.py              # Your RAG application code
├── evals.py            # Evaluation workflow
├── __init__.py         # Makes this a Python package
└── evals/              # Evaluation-related data
    ├── datasets/       # Test datasets
    ├── experiments/    # Experiment results
    └── logs/           # Evaluation logs and traces
```

Contributor comment on the directory listing:

> These directory listings are nice but sort of hard to maintain. It looks like these are already out of date 😅
Comment on lines +52 to +63

⚠️ Potential issue | 🟡 Minor

Add a language specifier to the fenced code block and correct the project structure.

The code block at Line 52 is missing a language specifier (flagged by markdownlint MD040). Also, rag.py is listed in the structure but does not exist in this PR — should this be mcp_client.py or another file?

Suggested fix

````diff
-```
+```text
 rag_eval/
 ├── README.md           # This file
 ├── pyproject.toml      # Project configuration
-├── rag.py              # Your RAG application code
+├── mcp_client.py       # MCP client for RAG queries
+├── models.py           # Pydantic models for evaluation
+├── ragas_utils.py      # Ragas metrics utilities
 ├── evals.py            # Evaluation workflow
 ├── __init__.py         # Makes this a Python package
 └── evals/              # Evaluation-related data
     ├── datasets/       # Test datasets
     ├── experiments/    # Experiment results
     └── logs/           # Evaluation logs and traces
-```
+```
````
🧰 Tools
🪛 markdownlint-cli2 (0.21.0)

[warning] 52-52: Fenced code blocks should have a language specified

(MD040, fenced-code-language)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rag_eval/README.md` around lines 52 - 63, Update the README's directory tree
fenced code block to include a language specifier (e.g., ```text) and correct
the file list: replace the non-existent rag.py entry with the actual files
present in the PR (mcp_client.py, models.py, ragas_utils.py) so the tree
accurately reflects the package; ensure the closing fence remains and the
comments (e.g., "# MCP client for RAG queries") match the new filenames to help
readers locate mcp_client.py, models.py, and ragas_utils.py.


## Customization

### Modify the LLM Provider

In `evals.py`, update the LLM configuration:

```python
from ragas.llms import llm_factory

# Use Anthropic Claude
llm = llm_factory("claude-3-5-sonnet-20241022", provider="anthropic")

# Use Google Gemini
llm = llm_factory("gemini-1.5-pro", provider="google")

# Use local Ollama
llm = llm_factory("mistral", provider="ollama", base_url="http://localhost:11434")
```

### Customize Test Cases

Edit the `load_dataset()` function in `evals.py` to add or modify test cases.
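For illustration only — a minimal sketch of what a `load_dataset()` entry might look like (the field names below are assumptions, not the actual schema used in `evals.py`):

```python
def load_dataset() -> list[dict]:
    # Each test case pairs a user question with a reference answer
    # that the RAG system's output is graded against.
    return [
        {
            "question": "Which datasets cover global sea surface temperature?",
            "reference": "Collections such as MODIS and VIIRS SST products.",
        },
        {
            "question": "How do I find granules for a given bounding box?",
            "reference": "Use the granule search with a spatial bounding-box filter.",
        },
    ]
```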

### Change Evaluation Metrics

Update the `my_metric` definition in `evals.py` to use different grading criteria.

## Documentation

Visit https://docs.ragas.io for more information.
3 changes: 3 additions & 0 deletions rag_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""RAG evaluation package for Earthdata MCP server."""

__version__ = "1.0.0"
Comment on lines +1 to +3

⚠️ Potential issue | 🟡 Minor

Version mismatch: __version__ is "1.0.0" but pyproject.toml declares "0.1.0".

These should be consistent. Since this is a new package, 0.1.0 is the more appropriate value.

Suggested fix
-__version__ = "1.0.0"
+__version__ = "0.1.0"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@rag_eval/__init__.py` around lines 1 - 3, The package version constant is
inconsistent: update the __version__ variable in rag_eval.__init__ to match
pyproject.toml by changing __version__ = "1.0.0" to __version__ = "0.1.0";
verify the value matches pyproject.toml and keep them synchronized going forward
(update whichever file should be authoritative if different).
