From 946d6c4c420e12a9ca183dbbb2821e1076171f2a Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 23 Jul 2025 12:36:58 -0400 Subject: [PATCH 1/6] Update evaluation README --- evaluation/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 669141d8..f597e389 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -72,11 +72,11 @@ import pandas as pd data = [ { "MODEL": "", - "INSTRUCTIONS": """""" + "INSTRUCTIONS": """""" }, { "MODEL": "", - "INSTRUCTIONS": """""" + "INSTRUCTIONS": """""" }, ] From 36e9e2d46e16ba25d3446b813ec4370779c97982 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 23 Jul 2025 13:12:00 -0400 Subject: [PATCH 2/6] Optimize evaluation performance using AsyncOpenAI for concurrent processing --- evaluation/evals.py | 26 ++++++++++++++++---------- server/api/services/llm_services.py | 16 ++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 08eda2bc..2d88bf65 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -23,6 +23,7 @@ import argparse import logging +import asyncio import pandas as pd @@ -37,7 +38,7 @@ ) -def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame: +async def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame: """ Test a prompt with a set of test data by scoring each item in the data set """ @@ -45,7 +46,7 @@ def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame try: handler = ModelFactory.get_handler(model) - generated_text, token_usage, pricing, duration = handler.handle_request( + generated_text, token_usage, pricing, duration = await handler.handle_request( instructions, input ) @@ -145,7 +146,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: return df -if __name__ == "__main__": +async def main(): # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file parser = argparse.ArgumentParser() @@ -177,14 +178,15 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames df_in = df_experiment.merge(df_dataset, how="cross") - # Evaluate each row in the input DataFrame - results = [] - for index, row in enumerate(df_in.itertuples(index=False)): - result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) - results.append(result) + # Evaluate each row in the input DataFrame concurrently + logging.info(f"Starting evaluation of {len(df_in)} rows") + tasks = [ + evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) + for row in df_in.itertuples(index=False) + ] - # TODO: Use tqdm or similar library to show progress bar - logging.info(f"Processed row {index + 1}/{len(df_in)}") + results = await asyncio.gather(*tasks) + logging.info(f"Completed evaluation of {len(results)} rows") df_evals = pd.concat(results, axis=0, ignore_index=True) @@ -195,3 +197,7 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: df_out.to_csv(args.results, index=False) logging.info(f"Results saved to {args.results}") logging.info("Evaluation completed successfully.") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 18c6e58f..69df8172 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -7,12 +7,12 @@ import logging 
from abc import ABC, abstractmethod -import openai +from openai import AsyncOpenAI class BaseModelHandler(ABC): @abstractmethod - def handle_request( + async def handle_request( self, query: str, context: str ) -> tuple[str, dict[str, int], dict[str, float], float]: pass @@ -31,9 +31,9 @@ class GPT4OMiniHandler(BaseModelHandler): PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} def __init__(self) -> None: - self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + self.client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - def handle_request( + async def handle_request( self, query: str, context: str ) -> tuple[str, dict[str, int], dict[str, float], float]: """ @@ -46,7 +46,7 @@ def handle_request( """ start_time = time.time() # TODO: Add error handling for API requests and invalid responses - response = self.client.responses.create( + response = await self.client.responses.create( model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time @@ -123,9 +123,9 @@ class GPT41NanoHandler(BaseModelHandler): """ def __init__(self) -> None: - self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + self.client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - def handle_request( + async def handle_request( self, query: str, context: str ) -> tuple[str, dict[str, int], dict[str, float], float]: """ @@ -144,7 +144,7 @@ def handle_request( start_time = time.time() # TODO: Add error handling for API requests and invalid responses - response = self.client.responses.create( + response = await self.client.responses.create( model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time From ac4a09e9095ef17838759898c55aaf9b324a3ee7 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 23 Jul 2025 16:01:05 -0400 Subject: [PATCH 3/6] Add comprehensive pytest test suite for evaluation module --- evaluation/test_evals.py | 239 ++++++++++++++++++++++++++++++++------- 1 file changed, 199 insertions(+), 40 deletions(-) diff --git a/evaluation/test_evals.py b/evaluation/test_evals.py index f41817c6..e6d0916d 100644 --- a/evaluation/test_evals.py +++ b/evaluation/test_evals.py @@ -1,53 +1,212 @@ - -from unittest.mock import patch, MagicMock +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = "==3.11.11" +# dependencies = [ +# "pandas==2.2.3", +# "lighteval==0.10.0", +# "openai==1.83.0", +# "spacy==3.8.7", +# "pytest==8.3.3", +# "pytest-asyncio==0.24.0", +# "pip" +# ] +# /// import pytest import pandas as pd +from unittest.mock import Mock, patch, AsyncMock +import tempfile +import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from evaluation.evals import evaluate_response, calculate_cost_metrics, load_csv + -from evals import evaluate_response +@pytest.fixture +def mock_token_usage(): + token_usage = Mock() + token_usage.input_tokens = 100 + token_usage.output_tokens = 50 + return token_usage -class MockTokenUsage: - def __init__(self, input_tokens, output_tokens): - self.input_tokens = input_tokens - self.output_tokens = output_tokens -@patch("evals.ModelFactory.get_handler") -@patch("evals.Extractiveness.compute") -def test_evaluate_response(mock_extractiveness_compute, mock_get_handler): +@pytest.fixture +def temp_csv(): + def _create_csv(content): + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write(content) + return f.name - # Mock 
BaseModelHandler - mock_handler = MagicMock() - mock_handler.handle_request.return_value = ( - "This is a summary.", - MockTokenUsage(input_tokens=100, output_tokens=50), - {"input": 15.0, "output": 30.0}, # $15 and $30 per 1M tokens - 1.23, # duration + return _create_csv + + +class TestCalculateCostMetrics: + @pytest.mark.parametrize( + "input_tokens,output_tokens,input_price,output_price,expected_input,expected_output,expected_total", + [ + (1000, 500, 5.0, 15.0, 0.005, 0.0075, 0.0125), + (0, 0, 5.0, 15.0, 0.0, 0.0, 0.0), + (1_000_000, 2_000_000, 10.0, 30.0, 10.0, 60.0, 70.0), + ], ) + def test_calculate_cost_metrics( + self, + input_tokens, + output_tokens, + input_price, + output_price, + expected_input, + expected_output, + expected_total, + ): + token_usage = Mock(input_tokens=input_tokens, output_tokens=output_tokens) + pricing = {"input": input_price, "output": output_price} + + result = calculate_cost_metrics(token_usage, pricing) - mock_get_handler.return_value = mock_handler + assert pytest.approx(result["input_cost"]) == expected_input + assert pytest.approx(result["output_cost"]) == expected_output + assert pytest.approx(result["total_cost"]) == expected_total - mock_extractiveness_compute.return_value = { - "summarization_coverage": 0.8, - "summarization_density": 1.5, - "summarization_compression": 2.0, - } - df = evaluate_response( - model_name="mock-model", - query="What is the summary?", - context="This is a long article about something important.", - reference="This is a reference summary.", +class TestLoadCsv: + @pytest.mark.parametrize( + "csv_content,required_columns,expected_len", + [ + ( + "model,instructions\ngpt-4,Test prompt\ngpt-3.5,Another prompt\n", + ["MODEL", "INSTRUCTIONS"], + 2, + ), + ( + " model , instructions \ngpt-4,Test prompt\n", + ["MODEL", "INSTRUCTIONS"], + 1, + ), + ("input\nTest input 1\nTest input 2\n", ["INPUT"], 2), + ], ) + def test_load_csv_valid( + self, temp_csv, csv_content, required_columns, expected_len + ): + temp_path = temp_csv(csv_content) + try: + df = load_csv(temp_path, required_columns) + assert len(df) == expected_len + assert list(df.columns) == required_columns + finally: + os.unlink(temp_path) + + @pytest.mark.parametrize( + "csv_content,required_columns", + [ + ("model,prompt\ngpt-4,Test prompt\n", ["MODEL", "INSTRUCTIONS"]), + ("wrong,columns\nval1,val2\n", ["MODEL", "INSTRUCTIONS"]), + ], + ) + def test_load_csv_missing_columns(self, temp_csv, csv_content, required_columns): + temp_path = temp_csv(csv_content) + try: + with pytest.raises(ValueError, match="must contain the following columns"): + load_csv(temp_path, required_columns) + finally: + os.unlink(temp_path) + + def test_load_csv_nonexistent_file(self): + with pytest.raises(FileNotFoundError): + load_csv("nonexistent_file.csv", ["MODEL"]) + + +class TestEvaluateResponse: + @pytest.mark.asyncio + async def test_evaluate_response_success(self, mock_token_usage): + mock_handler = AsyncMock() + mock_handler.handle_request.return_value = ( + "Generated response text", + mock_token_usage, + {"input": 5.0, "output": 15.0}, + 1.5, + ) + + mock_extractiveness = Mock() + mock_extractiveness.compute.return_value = { + "summarization_coverage": 0.8, + "summarization_density": 0.6, + "summarization_compression": 0.4, + } + + with ( + patch( + "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler + ), + patch("evaluation.evals.Extractiveness", return_value=mock_extractiveness), + ): + result = await evaluate_response("gpt-4", "Test instructions", "Test 
input") + + assert isinstance(result, pd.DataFrame) + assert len(result) == 1 + row = result.iloc[0] + assert row["Generated Text"] == "Generated response text" + assert row["Extractiveness Coverage"] == 0.8 + assert row["Input Token Usage"] == 100 + assert row["Output Token Usage"] == 50 + assert row["Duration (s)"] == 1.5 + + @pytest.mark.parametrize( + "exception_side_effect", ["get_handler", "handle_request", "extractiveness"] + ) + @pytest.mark.asyncio + async def test_evaluate_response_exceptions( + self, mock_token_usage, exception_side_effect + ): + if exception_side_effect == "get_handler": + with patch( + "evaluation.evals.ModelFactory.get_handler", + side_effect=Exception("Test error"), + ): + result = await evaluate_response( + "invalid-model", "Test instructions", "Test input" + ) + + elif exception_side_effect == "handle_request": + mock_handler = AsyncMock() + mock_handler.handle_request.side_effect = Exception("Handler error") + with patch( + "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler + ): + result = await evaluate_response( + "gpt-4", "Test instructions", "Test input" + ) + + elif exception_side_effect == "extractiveness": + mock_handler = AsyncMock() + mock_handler.handle_request.return_value = ( + "text", + mock_token_usage, + {"input": 5.0, "output": 15.0}, + 1.5, + ) + mock_extractiveness = Mock() + mock_extractiveness.compute.side_effect = Exception("Extractiveness error") + + with ( + patch( + "evaluation.evals.ModelFactory.get_handler", + return_value=mock_handler, + ), + patch( + "evaluation.evals.Extractiveness", return_value=mock_extractiveness + ), + ): + result = await evaluate_response( + "gpt-4", "Test instructions", "Test input" + ) + + assert isinstance(result, pd.DataFrame) + assert len(result) == 1 + assert pd.isna(result.iloc[0]["Generated Text"]) + - assert isinstance(df, pd.DataFrame) - assert df.shape == (1, 8) - assert df["Output Text"].iloc[0] == "This is a summary." - assert df["Extractiveness Coverage"].iloc[0] == 0.8 - assert df["Extractiveness Density"].iloc[0] == 1.5 - assert df["Extractiveness Compression"].iloc[0] == 2.0 - assert df["Input Token Usage"].iloc[0] == 100 - assert df["Output Token Usage"].iloc[0] == 50 - - expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50 - assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost - assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23 \ No newline at end of file +if __name__ == "__main__": + pytest.main([__file__]) From f95ed69632e56f43602e33f385d3ef8b9504bc1d Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 23 Jul 2025 16:32:18 -0400 Subject: [PATCH 4/6] Expand documentation for adding evaluation metrics and LLM models, including testing instructions --- evaluation/README.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index f597e389..a32df682 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -175,4 +175,32 @@ for metric in efficiency_metrics: ### Contributing -You're welcome to add LLM models to test in `server/api/services/llm_services` \ No newline at end of file +#### Adding Evaluation Metrics + +To add new evaluation metrics, modify the `evaluate_response()` function in `evaluation/evals.py`: + +**Update dependencies** in script header and ensure exception handling includes new metrics with `None` values. 
+ +#### Adding New LLM Models + +To add a new LLM model for evaluation, implement a handler in `server/api/services/llm_services.py`: + +1. **Create a handler class** inheriting from `BaseModelHandler`: +2. **Register in ModelFactory** by adding to the `HANDLERS` dictionary: +3. **Use in experiments** by referencing the handler key in your experiments CSV: + +The evaluation system will automatically use your handler through the Factory Method pattern. + + +#### Running Tests + +The evaluation module includes comprehensive tests for all core functions. Run the test suite using: + +```sh +uv run test_evals.py +``` + +The tests cover: +- **Cost calculation** with various token usage and pricing scenarios +- **CSV loading** with validation and error handling +- **Response evaluation** including async operations and exception handling From 252071ecb0597d41e9afd597d1ebd403cc948593 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 29 Jul 2025 12:10:44 -0400 Subject: [PATCH 5/6] ADD Add support for limiting CSV rows during test execution and include timing for evaluation --- evaluation/evals.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 2d88bf65..8eb7e9e6 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -24,6 +24,7 @@ import argparse import logging import asyncio +import time import pandas as pd @@ -117,19 +118,22 @@ def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict: } -def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: +def load_csv(file_path: str, required_columns: list, nrows: int = None) -> pd.DataFrame: """ Load a CSV file and validate that it contains the required columns Args: file_path (str): Path to the CSV file required_columns (list): List of required column names - + nrows (int): Number of rows to read from the CSV file Returns: pd.DataFrame """ - df = pd.read_csv(file_path) + if nrows is not None: + logging.info(f"Test mode enabled: Reading first {nrows} rows of {file_path}") + + df = pd.read_csv(file_path, nrows=nrows) # Remove trailing whitespace from column names df.columns = df.columns.str.strip() @@ -147,8 +151,6 @@ def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: async def main(): - # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file - parser = argparse.ArgumentParser() parser.add_argument( "--experiments", "-e", required=True, help="Path to experiments CSV file" @@ -159,34 +161,35 @@ async def main(): parser.add_argument( "--results", "-r", required=True, help="Path to results CSV file" ) + parser.add_argument( + "--test", "-t", type=int, help="Run evaluation on first n rows of dataset only" + ) args = parser.parse_args() + # Load the experiment DataFrame df_experiment = load_csv( args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] ) - # Check if all models are supported by ModelFactory - if not all( - model in ModelFactory.HANDLERS.keys() - for model in df_experiment["MODEL"].unique() - ): - raise ValueError( - f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" - ) - df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) + + # Load the dataset DataFrame + df_dataset = load_csv(args.dataset, required_columns=["INPUT"], nrows=args.test) # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames df_in = df_experiment.merge(df_dataset, how="cross") # Evaluate each 
row in the input DataFrame concurrently logging.info(f"Starting evaluation of {len(df_in)} rows") + start_time = time.time() tasks = [ evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) for row in df_in.itertuples(index=False) ] results = await asyncio.gather(*tasks) - logging.info(f"Completed evaluation of {len(results)} rows") + end_time = time.time() + duration = end_time - start_time + logging.info(f"Completed evaluation of {len(results)} rows in {duration} seconds") df_evals = pd.concat(results, axis=0, ignore_index=True) From f4590903ba18e06b3e391e2be1108b03510b3d37 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 29 Jul 2025 14:22:41 -0400 Subject: [PATCH 6/6] F401 imported but unused --- server/api/views/medRules/serializers.py | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/server/api/views/medRules/serializers.py b/server/api/views/medRules/serializers.py index 7638013c..df5e3663 100644 --- a/server/api/views/medRules/serializers.py +++ b/server/api/views/medRules/serializers.py @@ -1,6 +1,5 @@ from rest_framework import serializers from ...models.model_medRule import MedRule, MedRuleSource -from ..listMeds.models import Medication from ..listMeds.serializers import MedicationSerializer from ...models.model_embeddings import Embeddings @@ -8,7 +7,7 @@ class EmbeddingsSerializer(serializers.ModelSerializer): class Meta: model = Embeddings - fields = ['guid', 'name', 'text', 'page_num', 'chunk_number'] + fields = ["guid", "name", "text", "page_num", "chunk_number"] class MedicationWithSourcesSerializer(serializers.Serializer): @@ -22,28 +21,33 @@ class MedRuleSerializer(serializers.ModelSerializer): class Meta: model = MedRule fields = [ - 'id', 'rule_type', 'history_type', 'reason', 'label', 'explanation', - 'medication_sources' + "id", + "rule_type", + "history_type", + "reason", + "label", + "explanation", + "medication_sources", ] def get_medication_sources(self, obj): - - medrule_sources = MedRuleSource.objects.filter( - medrule=obj).select_related('medication', 'embedding') + medrule_sources = MedRuleSource.objects.filter(medrule=obj).select_related( + "medication", "embedding" + ) med_to_sources = {} for ms in medrule_sources: if ms.medication.id not in med_to_sources: med_to_sources[ms.medication.id] = { - 'medication': ms.medication, - 'sources': [] + "medication": ms.medication, + "sources": [], } - med_to_sources[ms.medication.id]['sources'].append(ms.embedding) + med_to_sources[ms.medication.id]["sources"].append(ms.embedding) return [ { - 'medication': MedicationSerializer(data['medication']).data, - 'sources': EmbeddingsSerializer(data['sources'], many=True).data + "medication": MedicationSerializer(data["medication"]).data, + "sources": EmbeddingsSerializer(data["sources"], many=True).data, } for data in med_to_sources.values() ]
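
For the "Adding New LLM Models" steps added to the README in PATCH 4/6, a minimal handler sketch of what steps 1 and 2 describe is shown below. The class name, `MODEL` string, registry key, and pricing figures are illustrative placeholders, and the return statement only mirrors the shape used by the existing handlers rather than quoting them.

```python
# Sketch for server/api/services/llm_services.py (os, time, and AsyncOpenAI are
# already imported in that module). All names and prices here are illustrative.
class MyNewModelHandler(BaseModelHandler):
    MODEL = "my-new-model"
    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}

    def __init__(self) -> None:
        self.client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    async def handle_request(
        self, query: str, context: str
    ) -> tuple[str, dict[str, int], dict[str, float], float]:
        start_time = time.time()
        response = await self.client.responses.create(
            model=self.MODEL, instructions=query, input=context, temperature=0.0
        )
        duration = time.time() - start_time
        # Same return shape as the other handlers: text, token usage, pricing, duration.
        return (
            response.output_text,
            response.usage,
            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
            duration,
        )

# Step 2: register the handler under the key used in the experiments CSV by adding
# an entry such as "my-new-model": MyNewModelHandler to ModelFactory.HANDLERS.
```

Step 3 is then just a row in the experiments CSV whose `MODEL` column contains that registry key.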