34 changes: 31 additions & 3 deletions evaluation/README.md
@@ -72,11 +72,11 @@ import pandas as pd
data = [
    {
        "MODEL": "<MODEL_NAME_1>",
        "INSTRUCTIONS": """<YOUR_QUERY_1>"""
        "INSTRUCTIONS": """<YOUR_INSTRUCTIONS_1>"""
    },
    {
        "MODEL": "<MODEL_NAME_2>",
        "INSTRUCTIONS": """<YOUR_QUERY_2>"""
        "INSTRUCTIONS": """<YOUR_INSTRUCTIONS_2>"""
    },
]

@@ -175,4 +175,32 @@ for metric in efficiency_metrics:

### Contributing

You're welcome to add LLM models to test in `server/api/services/llm_services`
#### Adding Evaluation Metrics

To add new evaluation metrics, modify the `evaluate_response()` function in `evaluation/evals.py`:

**Update the dependencies** in the script header and make sure the exception-handling path sets each new metric to `None`, so failed requests still produce complete rows.
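
A hedged sketch of that pattern, using a made-up `RESPONSE_LENGTH` metric; the column names and DataFrame layout below are illustrative, and only the `ModelFactory.get_handler()` / `handle_request()` calls mirror the real `evaluate_response()`:

```python
import logging

import pandas as pd

# ModelFactory comes from server/api/services/llm_services; its import is omitted here.


async def evaluate_response_sketch(model: str, instructions: str, input: str) -> pd.DataFrame:
    """Simplified stand-in for evaluate_response(); the real column set differs."""
    try:
        handler = ModelFactory.get_handler(model)
        generated_text, token_usage, pricing, duration = await handler.handle_request(
            instructions, input
        )
        row = {
            "MODEL": model,
            "GENERATED_TEXT": generated_text,
            "DURATION": duration,
            "RESPONSE_LENGTH": len(generated_text),  # the newly added metric
        }
    except Exception as exc:
        logging.error(f"Evaluation failed for {model}: {exc}")
        # Mirror every metric, including the new one, with None so all result rows align.
        row = {
            "MODEL": model,
            "GENERATED_TEXT": None,
            "DURATION": None,
            "RESPONSE_LENGTH": None,
        }
    return pd.DataFrame([row])
```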

#### Adding New LLM Models

To add a new LLM model for evaluation, implement a handler in `server/api/services/llm_services.py`:

1. **Create a handler class** that inherits from `BaseModelHandler` (see the sketch below).
2. **Register it in `ModelFactory`** by adding an entry to the `HANDLERS` dictionary.
3. **Use it in experiments** by putting the handler key in the `MODEL` column of your experiments CSV.

The evaluation system will automatically use your handler through the Factory Method pattern.
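
A hedged sketch of steps 1 and 2, written as if added inside `llm_services.py` itself, so `BaseModelHandler` and `ModelFactory` are already in scope. The class name, handler key, provider client, and the exact keys of `token_usage` and `pricing` are placeholders; the parts taken from `evals.py` are only the async `handle_request(instructions, input)` call and its `(generated_text, token_usage, pricing, duration)` return shape:

```python
import time


class MyProviderHandler(BaseModelHandler):  # hypothetical handler, step 1
    async def handle_request(self, instructions: str, input: str):
        start = time.monotonic()
        # Placeholder call: substitute your provider's real SDK client here.
        response = await my_provider_client.generate(system=instructions, prompt=input)
        duration = time.monotonic() - start

        generated_text = response.text
        token_usage = {
            "input_tokens": response.input_tokens,  # placeholder keys
            "output_tokens": response.output_tokens,
        }
        pricing = {"input": 0.0, "output": 0.0}  # placeholder per-token rates

        return generated_text, token_usage, pricing, duration


# Step 2: register the key that the experiments CSV's MODEL column will reference.
# In the repository this would be an entry in the existing HANDLERS dictionary
# literal (this sketch assumes HANDLERS maps keys to handler classes).
ModelFactory.HANDLERS["my-provider-model"] = MyProviderHandler
```

Step 3 is then just a row in the experiments CSV whose `MODEL` value is `my-provider-model`.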


#### Running Tests

The evaluation module includes comprehensive tests for all core functions. Run the test suite using:

```sh
uv run test_evals.py
```

The tests cover:
- **Cost calculation** with various token usage and pricing scenarios
- **CSV loading** with validation and error handling
- **Response evaluation** including async operations and exception handling
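
The evaluation script itself can also be smoke-tested end to end: the `--test` flag limits a run to the first *n* rows of the dataset. A hedged invocation, with placeholder CSV paths:

```sh
uv run evals.py \
  --experiments experiments.csv \
  --dataset dataset.csv \
  --results results.csv \
  --test 10
```
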
57 changes: 33 additions & 24 deletions evaluation/evals.py
@@ -23,6 +23,8 @@

import argparse
import logging
import asyncio
import time

import pandas as pd

@@ -37,15 +39,15 @@
)


def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
async def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
    """
    Test a prompt with a set of test data by scoring each item in the data set
    """

    try:
        handler = ModelFactory.get_handler(model)

        generated_text, token_usage, pricing, duration = handler.handle_request(
        generated_text, token_usage, pricing, duration = await handler.handle_request(
            instructions, input
        )

@@ -116,19 +118,22 @@ def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
    }


def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
def load_csv(file_path: str, required_columns: list, nrows: int = None) -> pd.DataFrame:
    """
    Load a CSV file and validate that it contains the required columns

    Args:
        file_path (str): Path to the CSV file
        required_columns (list): List of required column names
        nrows (int): Number of rows to read from the CSV file

    Returns:
        pd.DataFrame
    """

    df = pd.read_csv(file_path)
    if nrows is not None:
        logging.info(f"Test mode enabled: Reading first {nrows} rows of {file_path}")

    df = pd.read_csv(file_path, nrows=nrows)

    # Remove trailing whitespace from column names
    df.columns = df.columns.str.strip()
@@ -145,9 +150,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    return df


if __name__ == "__main__":
    # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file

async def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--experiments", "-e", required=True, help="Path to experiments CSV file"
@@ -158,33 +161,35 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    parser.add_argument(
        "--results", "-r", required=True, help="Path to results CSV file"
    )
    parser.add_argument(
        "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only"
    )

    args = parser.parse_args()

    # Load the experiment DataFrame
    df_experiment = load_csv(
        args.experiments, required_columns=["MODEL", "INSTRUCTIONS"]
    )
    # Check if all models are supported by ModelFactory
    if not all(
        model in ModelFactory.HANDLERS.keys()
        for model in df_experiment["MODEL"].unique()
    ):
        raise ValueError(
            f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}"
        )
    df_dataset = load_csv(args.dataset, required_columns=["INPUT"])

    # Load the dataset DataFrame
    df_dataset = load_csv(args.dataset, required_columns=["INPUT"], nrows=args.test)

    # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
    df_in = df_experiment.merge(df_dataset, how="cross")

    # Evaluate each row in the input DataFrame
    results = []
    for index, row in enumerate(df_in.itertuples(index=False)):
        result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
        results.append(result)
    # Evaluate each row in the input DataFrame concurrently
    logging.info(f"Starting evaluation of {len(df_in)} rows")
    start_time = time.time()
    tasks = [
        evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
        for row in df_in.itertuples(index=False)
    ]

        # TODO: Use tqdm or similar library to show progress bar
        logging.info(f"Processed row {index + 1}/{len(df_in)}")
    results = await asyncio.gather(*tasks)
    end_time = time.time()
    duration = end_time - start_time
    logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds")

    df_evals = pd.concat(results, axis=0, ignore_index=True)

@@ -195,3 +200,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    df_out.to_csv(args.results, index=False)
    logging.info(f"Results saved to {args.results}")
    logging.info("Evaluation completed successfully.")


if __name__ == "__main__":
    asyncio.run(main())