@@ -9,9 +9,10 @@ It supports batch evaluation via a configuration CSV and produces a detailed met
99
1010### Usage
1111
12- This script evaluates LLM outputs using the ` lighteval ` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
12+ This script evaluates LLM outputs using the `lighteval` library:
13+ https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
1314
14- Ensure you have the ` lighteval ` library and any model SDKs (e.g., OpenAI, Anthropic ) configured properly.
15+ Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly.
1516
1617
1718``` bash
@@ -21,17 +22,175 @@ python evals.py --config path/to/config.csv --reference path/to/reference.csv --
2122The arguments to the script are:
2223
2324- Path to the config CSV file: Must include the columns "Model Name" and "Query"
25+
26+ ```
27+ import pandas as pd
28+
29+ # Define the data
30+ data = [
31+
32+ {
33+ "Model Name": "GPT_4O_MINI",
34+ "Query": """
35+ You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X].
36+
37+ Act as a seasoned physician or medical professional who treats patients with bipolar disorder.
38+
39+ Identify rules for medication inclusion or exclusion based on medical history or concerns.
40+
41+ For each rule you find, return a JSON object using the following format:
42+
43+ {
44+ "rule": "<condition or concern>",
45+ "type": "INCLUDE" or "EXCLUDE",
46+ "reason": "<short explanation for why this rule applies>",
47+ "medications": ["<medication 1>", "<medication 2>", ...],
48+ "source": "<chunk-X>"
49+ }
50+
51+ Only include rules that are explicitly stated or strongly implied in the chunk.
52+
53+ Only use the chunks provided. If no rule is found in a chunk, skip it.
54+
55+ Return the entire output as a JSON array.
56+ """
57+ },
58+
59+ {
60+ "Model Name": "GPT_41_NANO",
61+ "Query": """
62+
63+ # Role and Objective
64+
65+ - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithm
66+
67+ - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID
68+
69+ # Instructions
70+
71+ - Identify decision points for bipolar medications
72+
73+ - For each decision point you find, return a JSON object using the following format:
74+
75+ {
76+ "criterion": "<condition or concern>",
77+ "decision": "INCLUDE" or "EXCLUDE",
78+ "medications": ["<medication 1>", "<medication 2>", ...],
79+ "reason": "<short explanation for why this criterion applies>",
80+ "sources": ["<ID-X>"]
81+ }
82+
83+
84+ - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge
85+
86+ # Output Format
87+
88+ - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array
89+
90+ # Example
91+
92+ [
93+ {
94+ "criterion": "History of suicide attempts",
95+ "decision": "INCLUDE",
96+ "medications": ["Lithium"],
97+ "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder",
98+ "sources": ["ID-0"]
99+ },
100+ {
101+ "criterion": "Weight gain concerns",
102+ "decision": "EXCLUDE",
103+ "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"],
104+ "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain",
105+ "sources": ["ID-0", "ID-1", "ID-2"]
106+ }
107+ ]
108+
109+ """
110+
111+ },
112+ ]
113+
114+ # Create DataFrame from records
115+ df = pd.DataFrame.from_records(data)
116+
117+ # Write to CSV
118+ df.to_csv("~/Desktop/evals_config.csv", index=False)
119+ ```
120+
121+
24122- Path to the reference CSV file: Must include the columns "Context" and "Reference"
123+
124+ ```
125+ from sqlalchemy import create_engine
126+ import pandas as pd
127+
128+ engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev")
129+ # Filter out papers that shouldn't be used from local database
130+ query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';"
131+ df = pd.read_sql(query, engine)
132+
133+ df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1)
134+ # Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining
135+ df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
136+ df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index()
137+ df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'})
138+ df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False)
139+ ```
140+
25141- Path where the evaluation results will be saved
26142
143+ import pandas as pd
144+ import matplotlib.pyplot as plt
145+ import numpy as np
146+
147+
148+ df = pd.read_csv("~/Desktop/evals_out-20250702.csv")
149+
150+ # Define the metrics of interest
151+ extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']
152+ token_cols = ['Input Token Usage', 'Output Token Usage']
153+ other_metrics = ['Cost (USD)', 'Duration (s)']
154+ all_metrics = extractiveness_cols + token_cols + other_metrics
155+
156+ # Metric histograms by model
157+ plt.style.use('default')
158+ fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))
159+
160+ models = df['Model Name'].unique()
161+ colors = plt.cm.Set3(np.linspace(0, 1, len(models)))
162+
163+ for i, metric in enumerate(all_metrics):
164+ ax = axes[i] if len(all_metrics) > 1 else axes
165+
166+ # Create histogram for each model
167+ for j, model in enumerate(models):
168+ model_data = df[df['Model Name'] == model][metric]
169+ ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)),
170+ color=colors[j], edgecolor='black', linewidth=0.5)
171+
172+ ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold')
173+ ax.set_xlabel(metric, fontsize=12)
174+ ax.set_ylabel('Frequency', fontsize=12)
175+ ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
176+ ax.grid(True, alpha=0.3)
177+
178+ plt.tight_layout()
179+ plt.show()
180+
181+ #TODO: Compute count, min, quantiles and max by model
182+ #TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second
183+
184+
185+
27186
28187The script outputs a CSV with the following columns:
29188
30189* Evaluates LLM outputs for:
31190
32- * Extractiveness Coverage
33- * Extractiveness Density
34- * Extractiveness Compression
191+ * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article
192+ * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs
193+ * Extractiveness Compression: Word ratio between the article and the summary
35194
36195* Computes:
37196
0 commit comments