
Commit fa7103c

Merge pull request #290 from sahilds1/262-extract-meds-rules
[#262] Add med rule extraction tests and model evaluations
2 parents b1fb75c + 6a6eb7a commit fa7103c

File tree: 5 files changed, +518 −24 lines
evaluation/README.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Evaluations

## LLM Output Evaluator

The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.

It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format.

### Usage

This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks

Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly.

```bash
python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
```

The arguments to the script are:

- Path to the config CSV file: must include the columns "Model Name" and "Query"
- Path to the reference CSV file: must include the columns "Context" and "Reference"
- Path where the evaluation results will be saved

For each model/query and context/reference pair, the output CSV reports:

* Extractiveness metrics:
  * Extractiveness Coverage
  * Extractiveness Density
  * Extractiveness Compression
* Usage and cost:
  * Token usage (input/output)
  * Estimated cost in USD
  * Duration (in seconds)
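For illustration, a minimal pair of input CSVs could be generated with the sketch below. The model name, query, and reference text are placeholders (any model name used must be a key registered in `ModelFactory.HANDLERS`):

```python
# Sketch only: writes a one-row config.csv and reference.csv with placeholder values.
import pandas as pd

pd.DataFrame(
    [{"Model Name": "example-model", "Query": "Extract the medication rules from the context."}]
).to_csv("config.csv", index=False)

pd.DataFrame(
    [{"Context": "Full source document text goes here.", "Reference": "Gold-standard extraction goes here."}]
).to_csv("reference.csv", index=False)
```

Running the `evals.py` command above against these two files would yield a single-row results CSV, since there is one model/query row and one context/reference row.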

evaluation/evals.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
"""
Evaluate LLM outputs using multiple metrics and compute associated costs
"""

# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs

import sys
import os

# Ensure the parent directory is on the path so ModelFactory can be imported
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import argparse
import logging

import pandas as pd
from lighteval.tasks.requests import Doc
from lighteval.metrics.metrics_sample import Extractiveness

from server.api.services.llm_services import ModelFactory

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def evaluate_response(
    model_name: str, query: str, context: str, reference: str
) -> pd.DataFrame:
    """
    Evaluate a model's response to a given query and context, computing extractiveness metrics, token usage, and cost.

    Args:
        model_name (str): The name of the model to be used for evaluation.
        query (str): The user query to be processed.
        context (str): The context or document content to be used.
        reference (str): The reference text for comparison (not used in this function, but available for further evaluations).

    Returns:
        pd.DataFrame: A single-row DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
    """

    handler = ModelFactory.get_handler(model_name)

    # TODO: Add error handling for unsupported models

    output_text, token_usage, pricing, duration = handler.handle_request(query, context)

    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
    extractiveness = Extractiveness().compute(
        formatted_doc=doc, predictions=[output_text]
    )

    # Pricing is expressed in USD per one million tokens
    input_cost_dollars = (pricing["input"] / 1_000_000) * token_usage.input_tokens
    output_cost_dollars = (pricing["output"] / 1_000_000) * token_usage.output_tokens

    total_cost_dollars = input_cost_dollars + output_cost_dollars

    return pd.DataFrame(
        [
            {
                "Output Text": output_text,
                "Extractiveness Coverage": extractiveness["summarization_coverage"],
                "Extractiveness Density": extractiveness["summarization_density"],
                "Extractiveness Compression": extractiveness[
                    "summarization_compression"
                ],
                "Input Token Usage": token_usage.input_tokens,
                "Output Token Usage": token_usage.output_tokens,
                "Cost (USD)": total_cost_dollars,
                "Duration (s)": duration,
            }
        ]
    )


if __name__ == "__main__":
    # TODO: Add CLI argument to specify the metrics to be computed
    parser = argparse.ArgumentParser(
        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
    )
    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
    parser.add_argument(
        "--reference", "-r", required=True, help="Path to reference CSV file"
    )
    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")

    args = parser.parse_args()

    df_config = pd.read_csv(args.config)
    logging.info(f"Config DataFrame shape: {df_config.shape}")
    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")

    # Strip leading/trailing whitespace from column names
    df_config.columns = df_config.columns.str.strip()

    # Check that the required columns are present
    required_columns = ["Model Name", "Query"]
    if not all(col in df_config.columns for col in required_columns):
        raise ValueError(
            f"Config DataFrame must contain the following columns: {required_columns}"
        )

    # Check that all models in the config are supported by ModelFactory
    if not all(
        model in ModelFactory.HANDLERS.keys()
        for model in df_config["Model Name"].unique()
    ):
        raise ValueError(
            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}"
        )

    df_reference = pd.read_csv(args.reference)
    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")

    # Strip leading/trailing whitespace from column names
    df_reference.columns = df_reference.columns.str.strip()

    # Check that the required columns are present
    required_columns = ["Context", "Reference"]
    if not all(col in df_reference.columns for col in required_columns):
        raise ValueError(
            f"Reference DataFrame must contain the following columns: {required_columns}"
        )

    # Cross join the config and reference DataFrames: every (model, query) pair
    # is evaluated against every (context, reference) pair
    df_in = df_config.merge(df_reference, how="cross")

    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
    df_evals = pd.DataFrame()
    for index, row in df_in.iterrows():
        df_evals = pd.concat(
            [
                df_evals,
                evaluate_response(
                    row["Model Name"], row["Query"], row["Context"], row["Reference"]
                ),
            ],
            axis=0,
        )

        logging.info(f"Processed row {index + 1}/{len(df_in)}")

    # Concatenate the input and evaluation DataFrames column-wise
    df_out = pd.concat(
        [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
    )

    df_out.to_csv(args.output, index=False)
    logging.info(f"Output DataFrame shape: {df_out.shape}")
    logging.info(f"Results saved to {args.output}")
    logging.info("Evaluation completed successfully.")

evaluation/test_evals.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
from unittest.mock import patch, MagicMock

import pytest
import pandas as pd

from evals import evaluate_response


class MockTokenUsage:
    def __init__(self, input_tokens, output_tokens):
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens


@patch("evals.ModelFactory.get_handler")
@patch("evals.Extractiveness.compute")
def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):

    # Mock BaseModelHandler
    mock_handler = MagicMock()
    mock_handler.handle_request.return_value = (
        "This is a summary.",
        MockTokenUsage(input_tokens=100, output_tokens=50),
        {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
        1.23,  # duration
    )

    mock_get_handler.return_value = mock_handler

    mock_extractiveness_compute.return_value = {
        "summarization_coverage": 0.8,
        "summarization_density": 1.5,
        "summarization_compression": 2.0,
    }

    df = evaluate_response(
        model_name="mock-model",
        query="What is the summary?",
        context="This is a long article about something important.",
        reference="This is a reference summary.",
    )

    assert isinstance(df, pd.DataFrame)
    assert df.shape == (1, 8)
    assert df["Output Text"].iloc[0] == "This is a summary."
    assert df["Extractiveness Coverage"].iloc[0] == 0.8
    assert df["Extractiveness Density"].iloc[0] == 1.5
    assert df["Extractiveness Compression"].iloc[0] == 2.0
    assert df["Input Token Usage"].iloc[0] == 100
    assert df["Output Token Usage"].iloc[0] == 50

    expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50
    assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost
    assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23
