Skip to content

Commit 6c592be

Browse files
committed
Update evaluation README with example scripts and remove obsolete Claude API client code from service module.
1 parent e5d7ac3 commit 6c592be

File tree

2 files changed

+210
-146
lines changed

2 files changed

+210
-146
lines changed

evaluation/README.md

Lines changed: 164 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@ It supports batch evaluation via a configuration CSV and produces a detailed met
99

1010
### Usage
1111

12-
This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
12+
This script evaluates LLM outputs using the `lighteval` library:
13+
https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
1314

14-
Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly.
15+
Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly.
1516

1617

1718
```bash
@@ -21,17 +22,175 @@ python evals.py --config path/to/config.csv --reference path/to/reference.csv --
2122
The arguments to the script are:
2223

2324
- Path to the config CSV file: Must include the columns "Model Name" and "Query"
25+
26+
```
27+
"""Build the evaluation config CSV consumed by evals.py.

The output CSV has the two columns evals.py requires: "Model Name" and
"Query" (the full prompt template sent to that model).
"""
import pandas as pd

# One record per model/prompt pair. Prompts are kept verbatim so the README
# example matches what was actually evaluated.
data = [
    {
        "Model Name": "GPT_4O_MINI",
        "Query": """
You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X].

Act as a seasoned physician or medical professional who treats patients with bipolar disorder.

Identify rules for medication inclusion or exclusion based on medical history or concerns.

For each rule you find, return a JSON object using the following format:

{
"rule": "<condition or concern>",
"type": "INCLUDE" or "EXCLUDE",
"reason": "<short explanation for why this rule applies>",
"medications": ["<medication 1>", "<medication 2>", ...],
"source": "<chunk-X>"
}

Only include rules that are explicitly stated or strongly implied in the chunk.

Only use the chunks provided. If no rule is found in a chunk, skip it.

Return the entire output as a JSON array.
""",
    },
    {
        "Model Name": "GPT_41_NANO",
        "Query": """
# Role and Objective

- You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithm

- You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID

# Instructions

- Identify decision points for bipolar medications

- For each decision point you find, return a JSON object using the following format:

{
"criterion": "<condition or concern>",
"decision": "INCLUDE" or "EXCLUDE",
"medications": ["<medication 1>", "<medication 2>", ...],
"reason": "<short explanation for why this criterion applies>",
"sources": ["<ID-X>"]
}

- Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge

# Output Format

- Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array

# Example

[
{
"criterion": "History of suicide attempts",
"decision": "INCLUDE",
"medications": ["Lithium"],
"reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder",
"sources": ["ID-0"]
},
{
"criterion": "Weight gain concerns",
"decision": "EXCLUDE",
"medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"],
"reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain",
"sources": ["ID-0", "ID-1", "ID-2"]
}
]
""",
    },
]


def build_config_df() -> pd.DataFrame:
    """Return the config table with the "Model Name" and "Query" columns."""
    return pd.DataFrame.from_records(data)


if __name__ == "__main__":
    # Guarded so importing this module does not write to disk.
    # pandas expands "~" to the user's home directory on write.
    build_config_df().to_csv("~/Desktop/evals_config.csv", index=False)
119+
```
120+
121+
24122
- Path to the reference CSV file: Must include the columns "Context" and "Reference"
123+
124+
```
125+
"""Build the reference CSV inputs from the local balancer_dev database.

Reads chunk rows from the api_embeddings table and writes one row per
(name, upload_file_id) pair with its chunks concatenated in chunk order.
"""
from sqlalchemy import create_engine
import pandas as pd

# Local dev database; credentials match the project's docker-compose setup
# (TODO confirm — do not reuse outside local development).
engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev")

# Filter out papers that shouldn't be used from local database
query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';"
df = pd.read_sql(query, engine)

# Label each chunk with its ID so model outputs can cite their source chunk.
df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1)

# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining
df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index()
df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'})
df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False)
139+
```
140+
25141
- Path where the evaluation results will be saved
26142

143+
"""Visualize evals.py output: per-model histograms of each evaluation metric."""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("~/Desktop/evals_out-20250702.csv")

# Define the metrics of interest
extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']
token_cols = ['Input Token Usage', 'Output Token Usage']
other_metrics = ['Cost (USD)', 'Duration (s)']
all_metrics = extractiveness_cols + token_cols + other_metrics

# Metric histograms by model: one stacked row of axes per metric.
plt.style.use('default')
fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))

models = df['Model Name'].unique()
colors = plt.cm.Set3(np.linspace(0, 1, len(models)))

for i, metric in enumerate(all_metrics):
    # plt.subplots returns a bare Axes (not an array) when only one subplot exists.
    ax = axes[i] if len(all_metrics) > 1 else axes

    # Create an overlaid histogram for each model; bin count capped by sample size.
    for j, model in enumerate(models):
        model_data = df[df['Model Name'] == model][metric]
        ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)),
                color=colors[j], edgecolor='black', linewidth=0.5)

    ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold')
    ax.set_xlabel(metric, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# TODO: Compute count, min, quantiles and max by model
# TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second
183+
184+
185+
27186

28187
The script outputs a CSV with the following columns:
29188

30189
* Evaluates LLM outputs for:
31190

32-
* Extractiveness Coverage
33-
* Extractiveness Density
34-
* Extractiveness Compression
191+
* Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article
192+
* Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs
193+
* Extractiveness Compression: Word ratio between the article and the summary
35194

36195
* Computes:
37196

0 commit comments

Comments
 (0)