34 changes: 31 additions & 3 deletions evaluation/README.md
@@ -72,11 +72,11 @@ import pandas as pd
data = [
    {
        "MODEL": "<MODEL_NAME_1>",
        "INSTRUCTIONS": """<YOUR_QUERY_1>"""
        "INSTRUCTIONS": """<YOUR_INSTRUCTIONS_1>"""
    },
    {
        "MODEL": "<MODEL_NAME_2>",
        "INSTRUCTIONS": """<YOUR_QUERY_2>"""
        "INSTRUCTIONS": """<YOUR_INSTRUCTIONS_2>"""
    },
]

@@ -175,4 +175,32 @@ for metric in efficiency_metrics:

### Contributing

You're welcome to add LLM models to test in `server/api/services/llm_services`
#### Adding Evaluation Metrics

To add new evaluation metrics, modify the `evaluate_response()` function in `evaluation/evals.py`:

**Update the dependencies** in the script header and make sure the exception-handling path sets each new metric to `None`, so failed requests still produce complete rows.
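
A hedged sketch of that pattern, using a made-up `RESPONSE_LENGTH` metric; the column names and DataFrame layout below are illustrative, and only the `ModelFactory.get_handler()` / `handle_request()` calls mirror the real `evaluate_response()`:

```python
import logging

import pandas as pd

# ModelFactory comes from server/api/services/llm_services; its import is omitted here.


async def evaluate_response_sketch(model: str, instructions: str, input: str) -> pd.DataFrame:
    """Simplified stand-in for evaluate_response(); the real column set differs."""
    try:
        handler = ModelFactory.get_handler(model)
        generated_text, token_usage, pricing, duration = await handler.handle_request(
            instructions, input
        )
        row = {
            "MODEL": model,
            "GENERATED_TEXT": generated_text,
            "DURATION": duration,
            "RESPONSE_LENGTH": len(generated_text),  # the newly added metric
        }
    except Exception as exc:
        logging.error(f"Evaluation failed for {model}: {exc}")
        # Mirror every metric, including the new one, with None so all result rows align.
        row = {
            "MODEL": model,
            "GENERATED_TEXT": None,
            "DURATION": None,
            "RESPONSE_LENGTH": None,
        }
    return pd.DataFrame([row])
```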

#### Adding New LLM Models

To add a new LLM model for evaluation, implement a handler in `server/api/services/llm_services.py`:

1. **Create a handler class** that inherits from `BaseModelHandler` (see the sketch below).
2. **Register it in `ModelFactory`** by adding an entry to the `HANDLERS` dictionary.
3. **Use it in experiments** by putting the handler key in the `MODEL` column of your experiments CSV.

The evaluation system will automatically use your handler through the Factory Method pattern.
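
A hedged sketch of steps 1 and 2, written as if added inside `llm_services.py` itself, so `BaseModelHandler` and `ModelFactory` are already in scope. The class name, handler key, provider client, and the exact keys of `token_usage` and `pricing` are placeholders; the parts taken from `evals.py` are only the async `handle_request(instructions, input)` call and its `(generated_text, token_usage, pricing, duration)` return shape:

```python
import time


class MyProviderHandler(BaseModelHandler):  # hypothetical handler, step 1
    async def handle_request(self, instructions: str, input: str):
        start = time.monotonic()
        # Placeholder call: substitute your provider's real SDK client here.
        response = await my_provider_client.generate(system=instructions, prompt=input)
        duration = time.monotonic() - start

        generated_text = response.text
        token_usage = {
            "input_tokens": response.input_tokens,  # placeholder keys
            "output_tokens": response.output_tokens,
        }
        pricing = {"input": 0.0, "output": 0.0}  # placeholder per-token rates

        return generated_text, token_usage, pricing, duration


# Step 2: register the key that the experiments CSV's MODEL column will reference.
# In the repository this would be an entry in the existing HANDLERS dictionary
# literal (this sketch assumes HANDLERS maps keys to handler classes).
ModelFactory.HANDLERS["my-provider-model"] = MyProviderHandler
```

Step 3 is then just a row in the experiments CSV whose `MODEL` value is `my-provider-model`.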


#### Running Tests

The evaluation module includes comprehensive tests for all core functions. Run the test suite using:

```sh
uv run test_evals.py
```

The tests cover:
- **Cost calculation** with various token usage and pricing scenarios
- **CSV loading** with validation and error handling
- **Response evaluation** including async operations and exception handling
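
The evaluation script itself can also be smoke-tested end to end: the `--test` flag limits a run to the first *n* rows of the dataset. A hedged invocation, with placeholder CSV paths:

```sh
uv run evals.py \
  --experiments experiments.csv \
  --dataset dataset.csv \
  --results results.csv \
  --test 10
```
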
57 changes: 33 additions & 24 deletions evaluation/evals.py
@@ -23,6 +23,8 @@

import argparse
import logging
import asyncio
import time

import pandas as pd

@@ -37,15 +39,15 @@
)


def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
async def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
    """
    Test a prompt with a set of test data by scoring each item in the data set
    """

    try:
        handler = ModelFactory.get_handler(model)

        generated_text, token_usage, pricing, duration = handler.handle_request(
        generated_text, token_usage, pricing, duration = await handler.handle_request(
            instructions, input
        )

@@ -116,19 +118,22 @@ def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
    }


def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
def load_csv(file_path: str, required_columns: list, nrows: int = None) -> pd.DataFrame:
    """
    Load a CSV file and validate that it contains the required columns

    Args:
        file_path (str): Path to the CSV file
        required_columns (list): List of required column names
        nrows (int): Number of rows to read from the CSV file

    Returns:
        pd.DataFrame
    """

    df = pd.read_csv(file_path)
    if nrows is not None:
        logging.info(f"Test mode enabled: Reading first {nrows} rows of {file_path}")

    df = pd.read_csv(file_path, nrows=nrows)

    # Remove trailing whitespace from column names
    df.columns = df.columns.str.strip()
@@ -145,9 +150,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    return df


if __name__ == "__main__":
    # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file

async def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--experiments", "-e", required=True, help="Path to experiments CSV file"
@@ -158,33 +161,35 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    parser.add_argument(
        "--results", "-r", required=True, help="Path to results CSV file"
    )
    parser.add_argument(
        "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only"
    )

    args = parser.parse_args()

    # Load the experiment DataFrame
    df_experiment = load_csv(
        args.experiments, required_columns=["MODEL", "INSTRUCTIONS"]
    )
    # Check if all models are supported by ModelFactory
    if not all(
        model in ModelFactory.HANDLERS.keys()
        for model in df_experiment["MODEL"].unique()
    ):
        raise ValueError(
            f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}"
        )
    df_dataset = load_csv(args.dataset, required_columns=["INPUT"])

    # Load the dataset DataFrame
    df_dataset = load_csv(args.dataset, required_columns=["INPUT"], nrows=args.test)

    # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
    df_in = df_experiment.merge(df_dataset, how="cross")

    # Evaluate each row in the input DataFrame
    results = []
    for index, row in enumerate(df_in.itertuples(index=False)):
        result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
        results.append(result)
    # Evaluate each row in the input DataFrame concurrently
    logging.info(f"Starting evaluation of {len(df_in)} rows")
    start_time = time.time()
    tasks = [
        evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
        for row in df_in.itertuples(index=False)
    ]

        # TODO: Use tqdm or similar library to show progress bar
        logging.info(f"Processed row {index + 1}/{len(df_in)}")
    results = await asyncio.gather(*tasks)
    end_time = time.time()
    duration = end_time - start_time
    logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds")

    df_evals = pd.concat(results, axis=0, ignore_index=True)

@@ -195,3 +200,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame:
    df_out.to_csv(args.results, index=False)
    logging.info(f"Results saved to {args.results}")
    logging.info("Evaluation completed successfully.")


if __name__ == "__main__":
    asyncio.run(main())