 import argparse
 import logging
 import asyncio
+import time

 import pandas as pd

@@ -117,19 +118,22 @@ def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
     }


-def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
+def load_csv(file_path: str, required_columns: list, nrows: int = None) -> pd.DataFrame:
     """
     Load a CSV file and validate that it contains the required columns

     Args:
         file_path (str): Path to the CSV file
         required_columns (list): List of required column names
-
+        nrows (int): Number of rows to read from the CSV file
     Returns:
         pd.DataFrame
     """

-    df = pd.read_csv(file_path)
+    if nrows is not None:
+        logging.info(f"Test mode enabled: Reading first {nrows} rows of {file_path}")
+
+    df = pd.read_csv(file_path, nrows=nrows)

     # Remove trailing whitespace from column names
     df.columns = df.columns.str.strip()
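For reference, a minimal sketch of the pandas behavior the new parameter relies on: nrows limits how many data rows pd.read_csv reads, and nrows=None (the default) reads the whole file, which is why passing args.test straight through works. The CSV contents here are hypothetical.

import io
import pandas as pd

# Hypothetical 100-row CSV with a single INPUT column
csv_text = "INPUT\n" + "\n".join(f"row {i}" for i in range(100))

df_full = pd.read_csv(io.StringIO(csv_text))            # nrows=None -> all 100 rows
df_head = pd.read_csv(io.StringIO(csv_text), nrows=10)  # test mode -> first 10 rows
assert len(df_full) == 100 and len(df_head) == 10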
@@ -147,8 +151,6 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:


 async def main():
-    # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file
-
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--experiments", "-e", required=True, help="Path to experiments CSV file"
@@ -159,34 +161,35 @@ async def main():
     parser.add_argument(
         "--results", "-r", required=True, help="Path to results CSV file"
     )
+    parser.add_argument(
+        "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only"
+    )

     args = parser.parse_args()

+    # Load the experiment DataFrame
     df_experiment = load_csv(
         args.experiments, required_columns=["MODEL", "INSTRUCTIONS"]
     )
-    # Check if all models are supported by ModelFactory
-    if not all(
-        model in ModelFactory.HANDLERS.keys()
-        for model in df_experiment["MODEL"].unique()
-    ):
-        raise ValueError(
-            f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}"
-        )
-    df_dataset = load_csv(args.dataset, required_columns=["INPUT"])
+
+    # Load the dataset DataFrame
+    df_dataset = load_csv(args.dataset, required_columns=["INPUT"], nrows=args.test)

     # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
     df_in = df_experiment.merge(df_dataset, how="cross")

     # Evaluate each row in the input DataFrame concurrently
     logging.info(f"Starting evaluation of {len(df_in)} rows")
+    start_time = time.time()
     tasks = [
         evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
         for row in df_in.itertuples(index=False)
     ]

     results = await asyncio.gather(*tasks)
-    logging.info(f"Completed evaluation of {len(results)} rows")
+    end_time = time.time()
+    duration = end_time - start_time
+    logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds")

     df_evals = pd.concat(results, axis=0, ignore_index=True)

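For illustration, a minimal sketch of the cross-join fan-out the evaluation relies on; the model names and inputs below are hypothetical:

import pandas as pd

# Hypothetical experiment grid: 2 model/prompt combinations, 3 dataset inputs
df_experiment = pd.DataFrame(
    {"MODEL": ["model-a", "model-b"], "INSTRUCTIONS": ["Be concise.", "Be concise."]}
)
df_dataset = pd.DataFrame({"INPUT": ["q1", "q2", "q3"]})

# how="cross" pairs every experiment row with every dataset row: 2 * 3 = 6 evaluations
df_in = df_experiment.merge(df_dataset, how="cross")
assert len(df_in) == 6

Note that argparse leaves args.test as None when the flag is omitted, so load_csv reads the full dataset by default. Assuming the script is named evaluate.py (name hypothetical; the flags are from this diff), a quick smoke test over the first ten dataset rows would look like:

python evaluate.py -e experiments.csv -d dataset.csv -r results.csv --test 10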