
Commit 3c9a1c9

Update README for clearer instructions, refactor evals.py for better error handling, column validation, and batch processing
1 parent 81cecae

File tree: 2 files changed (+133 / -108 lines)


evaluation/README.md

Lines changed: 10 additions & 11 deletions
@@ -21,9 +21,9 @@ API usage:
 * Estimated cost in USD
 * Duration (in seconds)

-### Test Data:
+### Test Data

-Generate the reference file by connecting to a database of references
+Generate the dataset file by connecting to a database of references

 Connect to the Postgres database of your local Balancer instance:

@@ -48,7 +48,7 @@ from sqlalchemy import create_engine
 engine = create_engine("postgresql://<USER>@localhost:5432/<DB_NAME>")
 ```

-Generate the reference CSV file:
+Generate the dataset CSV file:

 ```
 import pandas as pd
@@ -63,7 +63,7 @@ df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
 df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index()

 df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'})
-df_grouped.to_csv('<REFERENCE_CSV_PATH>', index=False)
+df_grouped.to_csv('<DATASET_CSV_PATH>', index=False)
 ```


@@ -94,7 +94,7 @@ data = [
 df = pd.DataFrame.from_records(data)

 # Write to CSV
-df.to_csv("<CONFIG_CSV_PATH>", index=False)
+df.to_csv("<EXPERIMENTS_CSV_PATH>", index=False)
 ```


@@ -104,13 +104,13 @@ df.to_csv("<CONFIG_CSV_PATH>", index=False)
 Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing environments:

 ```sh
-uv run evals.py --config path/to/<CONFIG_CSV> --reference path/to/<REFERENCE_CSV> --output path/to/<OUTPUT_CSV>
+uv run evals.py --experiments path/to/<EXPERIMENTS_CSV> --dataset path/to/<DATASET_CSV> --results path/to/<RESULTS_CSV>
 ```

 Execute without `uv run` by ensuring the script is executable:

 ```sh
-./evals.py --config path/to/<CONFIG_CSV> --reference path/to/<REFERENCE_CSV> --output path/to/<OUTPUT_CSV>
+./evals.py --experiments path/to/<EXPERIMENTS_CSV> --dataset path/to/<DATASET_CSV> --results path/to/<RESULTS_CSV>
 ```

 ### Analyzing Test Results
@@ -120,7 +120,7 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np

-df = pd.read_csv("<OUTPUT_CSV_PATH>")
+df = pd.read_csv("<RESULTS_CSV_PATH>")

 # Define the metrics of interest
 extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']
@@ -132,15 +132,15 @@ all_metrics = extractiveness_cols + token_cols + other_metrics
 plt.style.use('default')
 fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))

-models = df['Model Name'].unique()
+models = df['MODEL'].unique()
 colors = plt.cm.Set3(np.linspace(0, 1, len(models)))

 for i, metric in enumerate(all_metrics):
     ax = axes[i] if len(all_metrics) > 1 else axes

     # Create histogram for each model
     for j, model in enumerate(models):
-        model_data = df[df['Model Name'] == model][metric]
+        model_data = df[df['MODEL'] == model][metric]
         ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)),
                 color=colors[j], edgecolor='black', linewidth=0.5)

@@ -152,5 +152,4 @@ for i, metric in enumerate(all_metrics):

 plt.tight_layout()
 plt.show()
-
 ```
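For context on the renamed files: the refactored `load_csv` in `evals.py` (below) strips and uppercases column headers, then requires `MODEL` and `INSTRUCTIONS` in the experiments CSV and `INPUT` in the dataset CSV. The following is a minimal sketch, not part of this commit, of files that would pass that validation; the model name, prompt text, and output paths are placeholders.

```
import pandas as pd

# Hypothetical experiments file: one row per model/prompt combination.
# Headers are stripped and uppercased by load_csv, so "model" or "Model " also pass.
pd.DataFrame(
    [{"MODEL": "<MODEL_NAME>", "INSTRUCTIONS": "Summarize the reference text."}]
).to_csv("experiments.csv", index=False)

# Hypothetical dataset file: one row per input document to evaluate against.
pd.DataFrame(
    [{"INPUT": "Concatenated chunks for one reference document ..."}]
).to_csv("dataset.csv", index=False)
```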

evaluation/evals.py

Lines changed: 123 additions & 97 deletions
@@ -12,14 +12,6 @@
 Evaluate LLM outputs using multiple metrics and compute associated costs
 """

-# This script evaluates LLM outputs using the `lighteval` library
-# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
-
-# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist
-
-
-# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
-
 import sys
 import os


@@ -30,6 +22,8 @@
 import logging

 import pandas as pd
+
+# lighteval depends on `sentencepiece` and it only has prebuilt wheels for Python 3.11 or below
 from lighteval.tasks.requests import Doc
 from lighteval.metrics.metrics_sample import Extractiveness


@@ -40,129 +34,161 @@
 )


-def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame:
+def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
+    """
+    Test a prompt with a set of test data by scoring each item in the data set
     """
-    Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost

-    Args:
-        model_name (str): The name of the model to be used for evaluation.
-        query (str): The user query to be processed.
-        context (str): The context or document content to be used.
-        reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations).
+    try:
+        handler = ModelFactory.get_handler(model)

-    Returns:
-        pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
-    """
+        generated_text, token_usage, pricing, duration = handler.handle_request(
+            instructions, input
+        )

-    handler = ModelFactory.get_handler(model_name)
+        doc = Doc(query="", choices=[], gold_index=0, specific={"text": input})
+        extractiveness = Extractiveness().compute(
+            formatted_doc=doc, predictions=[generated_text]
+        )

-    # TODO: Add error handling for unsupported models
+        cost_metrics = calculate_cost_metrics(token_usage, pricing)

-    output_text, token_usage, pricing, duration = handler.handle_request(query, context)
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": generated_text,
+                    "Extractiveness Coverage": extractiveness["summarization_coverage"],
+                    "Extractiveness Density": extractiveness["summarization_density"],
+                    "Extractiveness Compression": extractiveness[
+                        "summarization_compression"
+                    ],
+                    "Input Token Usage": token_usage.input_tokens,
+                    "Output Token Usage": token_usage.output_tokens,
+                    "Cost (USD)": cost_metrics["total_cost"],
+                    "Duration (s)": duration,
+                }
+            ]
+        )

-    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
-    extractiveness = Extractiveness().compute(
-        formatted_doc=doc, predictions=[output_text]
-    )
+    except Exception as e:
+        logging.error(f"Error evaluating response for model {model}: {e}")
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": None,
+                    "Extractiveness Coverage": None,
+                    "Extractiveness Density": None,
+                    "Extractiveness Compression": None,
+                    "Input Token Usage": None,
+                    "Output Token Usage": None,
+                    "Cost (USD)": None,
+                    "Duration (s)": None,
+                }
+            ]
+        )
+
+    return result
+
+
+def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
+    """
+    Calculate cost metrics based on token usage and pricing
+    """

-    input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens
-    output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens
+    TOKENS_PER_MILLION = 1_000_000

+    # Pricing is in dollars per million tokens
+    input_cost_dollars = (
+        pricing["input"] / TOKENS_PER_MILLION
+    ) * token_usage.input_tokens
+    output_cost_dollars = (
+        pricing["output"] / TOKENS_PER_MILLION
+    ) * token_usage.output_tokens
     total_cost_dollars = input_cost_dollars + output_cost_dollars

-    return pd.DataFrame(
-        [
-            {
-                "Output Text": output_text,
-                "Extractiveness Coverage": extractiveness["summarization_coverage"],
-                "Extractiveness Density": extractiveness["summarization_density"],
-                "Extractiveness Compression": extractiveness[
-                    "summarization_compression"
-                ],
-                "Input Token Usage": token_usage.input_tokens,
-                "Output Token Usage": token_usage.output_tokens,
-                "Cost (USD)": total_cost_dollars,
-                "Duration (s)": duration,
-            }
-        ]
-    )
+    return {
+        "input_cost": input_cost_dollars,
+        "output_cost": output_cost_dollars,
+        "total_cost": total_cost_dollars,
+    }


-if __name__ == "__main__":
-    # TODO: Add test evaluation argument to run on the first 10 rows of the config file
+def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
+    """
+    Load a CSV file and validate that it contains the required columns

-    # TODO: Add CLI argument to specify the metrics to be computed
-    parser = argparse.ArgumentParser(
-        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
-    )
-    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
-    parser.add_argument(
-        "--reference", "-r", required=True, help="Path to reference CSV file"
-    )
-    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")
+    Args:
+        file_path (str): Path to the CSV file
+        required_columns (list): List of required column names

-    args = parser.parse_args()
+    Returns:
+        pd.DataFrame
+    """
+
+    df = pd.read_csv(file_path)

-    df_config = pd.read_csv(args.config)
-    logging.info(f"Config DataFrame shape: {df_config.shape}")
-    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")
+    # Remove trailing whitespace from column names
+    df.columns = df.columns.str.strip()

-    # Remove the trailing whitespace from column names
-    df_config.columns = df_config.columns.str.strip()
+    # Uppercase the column names to match the expected format
+    df.columns = df.columns.str.upper()

     # Check if the required columns are present
-    # TODO: Make this more flexible by allowing the user to use default instructions
-    required_columns = ["Model Name", "Query"]
-    if not all(col in df_config.columns for col in required_columns):
+    if not all(col in df.columns for col in required_columns):
         raise ValueError(
-            f"Config DataFrame must contain the following columns: {required_columns}"
+            f"{file_path} must contain the following columns: {required_columns}"
         )

-    # Check if all models in the config are supported by ModelFactory
+    return df
+
+
+if __name__ == "__main__":
+    # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--experiments", "-e", required=True, help="Path to experiments CSV file"
+    )
+    parser.add_argument(
+        "--dataset", "-d", required=True, help="Path to dataset CSV file"
+    )
+    parser.add_argument(
+        "--results", "-r", required=True, help="Path to results CSV file"
+    )
+
+    args = parser.parse_args()
+
+    df_experiment = load_csv(
+        args.experiments, required_columns=["MODEL", "INSTRUCTIONS"]
+    )
+    # Check if all models are supported by ModelFactory
     if not all(
         model in ModelFactory.HANDLERS.keys()
-        for model in df_config["Model Name"].unique()
+        for model in df_experiment["MODEL"].unique()
     ):
         raise ValueError(
-            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}"
-        )
-
-    df_reference = pd.read_csv(args.reference)
-    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
-    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")
-
-    # Remove the trailing whitespace from column names
-    df_reference.columns = df_reference.columns.str.strip()
-    # Check if the required columns are present
-    required_columns = ["Context"]
-    if not all(col in df_reference.columns for col in required_columns):
-        raise ValueError(
-            f"Reference DataFrame must contain the following columns: {required_columns}"
+            f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}"
         )
+    df_dataset = load_csv(args.dataset, required_columns=["INPUT"])

-    # Cross join the config and reference DataFrames
-    df_in = df_config.merge(df_reference, how="cross")
+    # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
+    df_in = df_experiment.merge(df_dataset, how="cross")

-    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
-    df_evals = pd.DataFrame()
-    for index, row in df_in.iterrows():
-        df_evals = pd.concat(
-            [
-                df_evals,
-                evaluate_response(row["Model Name"], row["Query"], row["Context"]),
-            ],
-            axis=0,
-        )
+    # Evaluate each row in the input DataFrame
+    results = []
+    for index, row in enumerate(df_in.itertuples(index=False)):
+        result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
+        results.append(result)

+        # TODO: Use tqdm or similar library to show progress bar
         logging.info(f"Processed row {index + 1}/{len(df_in)}")

-    # Concatenate the input and evaluations DataFrames
+    df_evals = pd.concat(results, axis=0, ignore_index=True)

+    # Concatenate the input and evaluations DataFrames
     df_out = pd.concat(
         [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
     )
-
-    df_out.to_csv(args.output, index=False)
-    logging.info(f"Output DataFrame shape: {df_out.shape}")
-    logging.info(f"Results saved to {args.output}")
+    df_out.to_csv(args.results, index=False)
+    logging.info(f"Results saved to {args.results}")
     logging.info("Evaluation completed successfully.")
