
Commit 252071e

Add support for limiting CSV rows during test execution and include timing for evaluation
1 parent 11a3041 commit 252071e

File tree

evaluation/evals.py

1 file changed: +18 -15 lines


evaluation/evals.py

Lines changed: 18 additions & 15 deletions
@@ -24,6 +24,7 @@
 import argparse
 import logging
 import asyncio
+import time
 
 import pandas as pd
 
@@ -117,19 +118,22 @@ def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
     }
 
 
-def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
+def load_csv(file_path: str, required_columns: list, nrows: int = None) -> pd.DataFrame:
     """
     Load a CSV file and validate that it contains the required columns
 
     Args:
         file_path (str): Path to the CSV file
         required_columns (list): List of required column names
-
+        nrows (int): Number of rows to read from the CSV file
     Returns:
         pd.DataFrame
     """
 
-    df = pd.read_csv(file_path)
+    if nrows is not None:
+        logging.info(f"Test mode enabled: Reading first {nrows} rows of {file_path}")
+
+    df = pd.read_csv(file_path, nrows=nrows)
 
     # Remove trailing whitespace from column names
     df.columns = df.columns.str.strip()
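
The row cap itself is delegated to pandas: pd.read_csv stops parsing once it has read nrows data rows, and nrows=None preserves the old read-everything behavior. A minimal sketch of that behavior, using a hypothetical in-memory CSV in place of a real dataset file:

    import io

    import pandas as pd

    # Hypothetical stand-in for a dataset CSV with an INPUT column
    csv_text = "INPUT\nfirst row\nsecond row\nthird row\n"

    df_all = pd.read_csv(io.StringIO(csv_text))              # nrows omitted: all 3 rows
    df_capped = pd.read_csv(io.StringIO(csv_text), nrows=2)  # only the first 2 rows

    print(len(df_all), len(df_capped))  # prints: 3 2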
@@ -147,8 +151,6 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
 
 
 async def main():
-    # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file
-
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--experiments", "-e", required=True, help="Path to experiments CSV file"
@@ -159,34 +161,35 @@ async def main():
     parser.add_argument(
         "--results", "-r", required=True, help="Path to results CSV file"
     )
+    parser.add_argument(
+        "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only"
+    )
 
     args = parser.parse_args()
 
+    # Load the experiment DataFrame
     df_experiment = load_csv(
         args.experiments, required_columns=["MODEL", "INSTRUCTIONS"]
     )
-    # Check if all models are supported by ModelFactory
-    if not all(
-        model in ModelFactory.HANDLERS.keys()
-        for model in df_experiment["MODEL"].unique()
-    ):
-        raise ValueError(
-            f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}"
-        )
-    df_dataset = load_csv(args.dataset, required_columns=["INPUT"])
+
+    # Load the dataset DataFrame
+    df_dataset = load_csv(args.dataset, required_columns=["INPUT"], nrows=args.test)
 
     # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
     df_in = df_experiment.merge(df_dataset, how="cross")
 
     # Evaluate each row in the input DataFrame concurrently
     logging.info(f"Starting evaluation of {len(df_in)} rows")
+    start_time = time.time()
     tasks = [
         evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
         for row in df_in.itertuples(index=False)
     ]
 
     results = await asyncio.gather(*tasks)
-    logging.info(f"Completed evaluation of {len(results)} rows")
+    end_time = time.time()
+    duration = end_time - start_time
+    logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds")
 
     df_evals = pd.concat(results, axis=0, ignore_index=True)
 
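Because --test is declared with type=int and no default, argparse yields None when the flag is omitted, so nrows=args.test leaves full runs unchanged. A standalone check of that behavior, with the parser trimmed down to the one new flag:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only"
    )

    print(parser.parse_args([]).test)                # None -> load_csv reads every row
    print(parser.parse_args(["--test", "10"]).test)  # 10 -> load_csv reads 10 rows

A smoke-test invocation might then look like python evaluation/evals.py -e experiments.csv -d dataset.csv -r results.csv --test 10, with all file paths hypothetical.

The added timing is plain wall-clock measurement around the asyncio.gather call. A self-contained sketch of the same pattern, with a dummy coroutine standing in for evaluate_response:

    import asyncio
    import logging
    import time

    logging.basicConfig(level=logging.INFO)

    async def fake_evaluate(i: int) -> int:
        # Hypothetical stand-in for evaluate_response
        await asyncio.sleep(0.1)
        return i

    async def run() -> None:
        start_time = time.time()
        tasks = [fake_evaluate(i) for i in range(5)]
        results = await asyncio.gather(*tasks)  # concurrent: ~0.1 s total, not 0.5 s
        duration = time.time() - start_time
        logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds")

    asyncio.run(run())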