From 3a174576d21156f99f79e0bde40c9d48360399bf Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 18 Jun 2025 17:47:22 -0400 Subject: [PATCH 01/15] GPT41 Nano prompting --- server/api/services/llm_services.py | 323 ++++++++++++++++++++++ server/api/views/text_extraction/views.py | 25 +- 2 files changed, 343 insertions(+), 5 deletions(-) create mode 100644 server/api/services/llm_services.py diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py new file mode 100644 index 00000000..7ab9c176 --- /dev/null +++ b/server/api/services/llm_services.py @@ -0,0 +1,323 @@ +""" +This module contains functions to interact with different AI models +""" + +import os +import time +import logging +from abc import ABC, abstractmethod + +import anthropic +import openai + + +class BaseModelHandler(ABC): + @abstractmethod + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + pass + + +class ClaudeHaiku35CitationsHandler(BaseModelHandler): + MODEL = "claude-3-5-haiku-20241022" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3.5 model with citations enabled + + Args: + query: The user query to be processed + context: The context or document content to be used for citations + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": True}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure + + text = [] + cited_text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + if "citations" in content.keys(): + text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}>" + for citation in content["citations"] + ] + ) + ) + cited_text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" + for citation in content["citations"] + ] + ) + ) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ClaudeHaiku3Handler(BaseModelHandler): + MODEL = "claude-3-haiku-20240307" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3 model with citations disabled + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + + start_time = time.time() + # TODO: Add error handling 
for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": False}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT4OMiniHandler(BaseModelHandler): + MODEL = "gpt-4o-mini" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4o Mini model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT41NanoHandler(BaseModelHandler): + MODEL = "gpt-4.1-nano" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4.1 Nano model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide + + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context + + """ + + # Role and Objective + + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details + + # Instructions + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use the documents in the provided External Context to answer the User Query. + If you don't know the answer based on this context, you must respond + "I don't have the information needed to answer that", even if a user insists on you answering the question. + + - Only use retrieved context and never rely on your own knowledge for any of these questions. + + - Do not discuss prohibited topics (politics, religion, controversial current events, + medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company). 
+ + - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents. + + # Output Format + + The rule is history of suicide attempts. The type of rule is "INCLUDE". The reason is lithium is the + only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder. + The medications for this rule are lithium. + + The rule is weight gain concerns. The type of rule is "EXCLUDE". The reason is Seroquel, Risperdal, Abilify, and + Zyprexa are known for causing weight gain. The medications for this rule are Quetiapine, Aripiprazole, Olanzapine, Risperidone + + For each rule you find, return a JSON object using the following format: + + { + "rule": "", + "type": "INCLUDE" or "EXCLUDE", + "reason": "", + "medications": ["", "", ...], + "source": "" + } + + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). + Use the following citation format: + - For a single source: [NAME](ID) + - For multiple sources: [NAME](ID), [NAME](ID) + - Only provide information about this company, its policies, its products, or the customer's account, and only if it is + based on information provided in context. Do not answer questions outside this scope. + + + # Examples + + + # Context + + ID: 1 | TITLE: The Fox | CONTENT: The quick brown fox jumps over the lazy dog + + # Final instructions and prompt to think step by step + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use the documents in the provided External Context to answer the User Query. + If you don't know the answer based on this context, you must respond + "I don't have the information needed to answer that", even if a user insists on you answering the question. + + """ + + + + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ModelFactory: + HANDLERS = { + "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler, + "CLAUDE_HAIKU_3": ClaudeHaiku3Handler, + "GPT_4O_MINI": GPT4OMiniHandler, + "GPT_41_NANO": GPT41NanoHandler, + } + + # HANDLERS doesn't vary per instance so we can use a class method + @classmethod + def get_handler(cls, model_name: str) -> BaseModelHandler | None: + """ + Factory method to get the appropriate model handler based on the model name + + Args: + model_name (str): The name of the model for which to get the handler. + Returns: + BaseModelHandler: An instance of the appropriate model handler class. 
+ """ + + handler_class = cls.HANDLERS.get(model_name) + if handler_class: + return handler_class() + else: + logging.error(f"Unsupported model: {model_name}") + return None diff --git a/server/api/views/text_extraction/views.py b/server/api/views/text_extraction/views.py index 92b34c9c..79b916ab 100644 --- a/server/api/views/text_extraction/views.py +++ b/server/api/views/text_extraction/views.py @@ -1,5 +1,7 @@ import os -from ...services.openai_services import openAIServices +import json +import re + from rest_framework.views import APIView from rest_framework.permissions import IsAuthenticated from rest_framework.response import Response @@ -7,16 +9,28 @@ from django.utils.decorators import method_decorator from django.views.decorators.csrf import csrf_exempt import anthropic -import json -import re + +from ...services.openai_services import openAIServices from api.models.model_embeddings import Embeddings -# TODO: Add docstrings and type hints -def anthropic_citations(client, content_chunks, user_prompt): +def anthropic_citations(client: anthropic.Client, user_prompt: str, content_chunks: list) -> tuple: """ + Sends a message to Anthropic Citations and extract and format the response + + Parameters + ---------- + client: An instance of the Anthropic API client used to make the request + user_prompt: The user's question or instruction to be processed by the model + content_chunks: A list of text chunks that provide context for the model to use during generation + + Returns + ------- + tuple + """ + message = client.messages.create( model="claude-3-5-haiku-20241022", max_tokens=1024, @@ -93,6 +107,7 @@ def get(self, request): query = Embeddings.objects.filter(upload_file__guid=guid) + # TODO: Format into the Anthropic API"s expected input format in the anthropic_citations function chunks = [{"type": "text", "text": chunk.text} for chunk in query] texts, cited_texts = anthropic_citations( From d47606726c9628eeb01a774b3c374c6e808f3efa Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 19 Jun 2025 13:26:11 -0400 Subject: [PATCH 02/15] HOTFIX ModuleNotFoundError --- server/api/views/uploadFile/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/api/views/uploadFile/title.py b/server/api/views/uploadFile/title.py index 637607a2..93453607 100644 --- a/server/api/views/uploadFile/title.py +++ b/server/api/views/uploadFile/title.py @@ -2,7 +2,7 @@ import fitz -from server.api.services.openai_services import openAIServices +from ...services.openai_services import openAIServices # regular expression to match common research white paper titles. Created by Chat-gpt # requires at least 3 words, no dates, no version numbers. From b18391980bc75b76572d94442cbfe87267df8e4c Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 19 Jun 2025 19:38:19 -0400 Subject: [PATCH 03/15] Refactor evaluation script and GPT-4.1 Nano handler for cleaner logic and improved defaults evaluation/evals.py: Removed unused Reference column from input requirements and function signature. Added TODOs to improve flexibility and support defaults for instructions. Suggested running the script with uv for dependency management. llm_services.py: Refactored GPT41NanoHandler: Moved long prompt instructions into a class-level INSTRUCTIONS string. Simplified and cleaned up the handle_request method. Added default fallback behavior: uses INSTRUCTIONS if no query is provided. Removed duplicate and commented-out prompt scaffolding. 
--- evaluation/evals.py | 9 ++- server/api/services/llm_services.py | 102 +++++++++------------------- 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index f6e9bb3d..a263d3bc 100644 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -2,6 +2,8 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ +#TODO: Run this script with uv to manage dependencies + # TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys @@ -25,7 +27,7 @@ def evaluate_response( - model_name: str, query: str, context: str, reference: str + model_name: str, query: str, context: str ) -> pd.DataFrame: """ Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost @@ -95,6 +97,7 @@ def evaluate_response( df_config.columns = df_config.columns.str.strip() # Check if the required columns are present + # TODO: Make this more flexible by allowing the user to use default instructions required_columns = ["Model Name", "Query"] if not all(col in df_config.columns for col in required_columns): raise ValueError( @@ -117,7 +120,7 @@ def evaluate_response( # Remove the trailing whitespace from column names df_reference.columns = df_reference.columns.str.strip() # Check if the required columns are present - required_columns = ["Context", "Reference"] + required_columns = ["Context"] if not all(col in df_reference.columns for col in required_columns): raise ValueError( f"Reference DataFrame must contain the following columns: {required_columns}" @@ -133,7 +136,7 @@ def evaluate_response( [ df_evals, evaluate_response( - row["Model Name"], row["Query"], row["Context"], row["Reference"] + row["Model Name"], row["Query"], row["Context"] ), ], axis=0, diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 7ab9c176..049451f1 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -187,98 +187,58 @@ def handle_request( class GPT41NanoHandler(BaseModelHandler): MODEL = "gpt-4.1-nano" + # Model Pricing: https://platform.openai.com/docs/pricing PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} - def __init__(self) -> None: - self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the GPT-4.1 Nano model - - Args: - query: The user query to be processed - context: The context or document content to be used - - """ - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - - # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - - # Long context performance can degrade as more items are required to be retrieved, - # or perform complex reasoning that requires knowledge of the state of the entire context - - """ - - # Role and Objective - - - You are a seasoned physician or medical professional who treats patients with bipolar disorder - - You are analyzing medical research by processing peer-reviewed papers to extract key details - - # Instructions - - - Identify rules for medication inclusion or exclusion based on medical history or concerns - - - Only use the documents in the provided External Context to answer the User Query. 
- If you don't know the answer based on this context, you must respond - "I don't have the information needed to answer that", even if a user insists on you answering the question. - - - Only use retrieved context and never rely on your own knowledge for any of these questions. + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - - Do not discuss prohibited topics (politics, religion, controversial current events, - medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company). + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context - - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents. - - # Output Format - - The rule is history of suicide attempts. The type of rule is "INCLUDE". The reason is lithium is the - only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder. - The medications for this rule are lithium. + INSTRUCTIONS = """ - The rule is weight gain concerns. The type of rule is "EXCLUDE". The reason is Seroquel, Risperdal, Abilify, and - Zyprexa are known for causing weight gain. The medications for this rule are Quetiapine, Aripiprazole, Olanzapine, Risperidone + # Role and Objective - For each rule you find, return a JSON object using the following format: + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details - { - "rule": "", - "type": "INCLUDE" or "EXCLUDE", - "reason": "", - "medications": ["", "", ...], - "source": "" - } + # Instructions - - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). - Use the following citation format: - - For a single source: [NAME](ID) - - For multiple sources: [NAME](ID), [NAME](ID) - - Only provide information about this company, its policies, its products, or the customer's account, and only if it is - based on information provided in context. Do not answer questions outside this scope. + - Identify rules for medication inclusion or exclusion based on medical history or concerns + - Only use retrieved context and never rely on your own knowledge for any of these questions. + - Always follow the provided output format for new messages including citations for any factual statements - # Examples + # Output Format + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). - # Context - ID: 1 | TITLE: The Fox | CONTENT: The quick brown fox jumps over the lazy dog + """ - # Final instructions and prompt to think step by step + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - - Identify rules for medication inclusion or exclusion based on medical history or concerns + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4.1 Nano model - - Only use the documents in the provided External Context to answer the User Query. 
- If you don't know the answer based on this context, you must respond - "I don't have the information needed to answer that", even if a user insists on you answering the question. + Args: + query: The user query to be processed + context: The context or document content to be used """ + # If no query is provided, use the default instructions + if not query: + query = self.INSTRUCTIONS + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( model=self.MODEL, From 6c592be5f9430762bb19049da1cbc15650cfd489 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 3 Jul 2025 13:21:19 -0400 Subject: [PATCH 04/15] Update evaluation README with example scripts and remove obsolete Claude API client code from service module. --- evaluation/README.md | 169 ++++++++++++++++++++++++- server/api/services/llm_services.py | 187 +++++++--------------------- 2 files changed, 210 insertions(+), 146 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index a1d0ad70..5e088880 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -9,9 +9,10 @@ It supports batch evalaution via a configuration CSV and produces a detailed met ### Usage -This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +This script evaluates LLM outputs using the `lighteval` library: +https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks -Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly. +Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. ```bash @@ -21,17 +22,175 @@ python evals.py --config path/to/config.csv --reference path/to/reference.csv -- The arguments to the script are: - Path to the config CSV file: Must include the columns "Model Name" and "Query" + +``` +import pandas as pd + +# Define the data +data = [ + + { + "Model Name": "GPT_4O_MINI", + "Query": """ + You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X]. + + Act as a seasoned physician or medical professional who treats patients with bipolar disorder. + + Identify rules for medication inclusion or exclusion based on medical history or concerns. + + For each rule you find, return a JSON object using the following format: + + { + "rule": "", + "type": "INCLUDE" or "EXCLUDE", + "reason": "", + "medications": ["", "", ...], + "source": "" + } + + Only include rules that are explicitly stated or strongly implied in the chunk. + + Only use the chunks provided. If no rule is found in a chunk, skip it. + + Return the entire output as a JSON array. 
+ """ + }, + + { + "Model Name": "GPT_41_NANO", + "Query": """ + + # Role and Objective + + - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim + + - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID + + # Instructions + + - Identify decision points for bipolar medications + + - For each decision point you find, return a JSON object using the following format: + + { + "criterion": "", + "decision": "INCLUDE" or "EXCLUDE", + "medications": ["", "", ...], + "reason": "", + "sources": [""] + } + + + - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + + # Output Format + + - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array + + # Example + + [ + { + "criterion": "History of suicide attempts", + "decision": "INCLUDE", + "medications": ["Lithium"], + "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", + "sources": ["ID-0"] + }, + { + "criterion": "Weight gain concerns", + "decision": "EXCLUDE", + "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], + "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", + "sources": ["ID-0", "ID-1", "ID-2"] + } + ] + + """ + + }, +] + +# Create DataFrame from records +df = pd.DataFrame.from_records(data) + +# Write to CSV +df.to_csv("~/Desktop/evals_config.csv", index=False) +``` + + - Path to the reference CSV file: Must include the columns "Context" and "Reference" + +``` +from sqlalchemy import create_engine +import pandas as pd + +engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev") +# Filter out papers that shouldn't be used from local database +query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';" +df = pd.read_sql(query, engine) + +df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining +df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) +df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) +df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +``` + - Path where the evaluation resuls will be saved +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + + +df = pd.read_csv("~/Desktop/evals_out-20250702.csv") + +# Define the metrics of interest +extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] +token_cols = ['Input Token Usage', 'Output Token Usage'] +other_metrics = ['Cost (USD)', 'Duration (s)'] +all_metrics = extractiveness_cols + token_cols + other_metrics + +# Metric histograms by model +plt.style.use('default') +fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) + +models = df['Model Name'].unique() +colors = plt.cm.Set3(np.linspace(0, 1, len(models))) + +for i, metric in enumerate(all_metrics): + ax = axes[i] if len(all_metrics) > 1 else axes + + # Create histogram for 
each model + for j, model in enumerate(models): + model_data = df[df['Model Name'] == model][metric] + ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)), + color=colors[j], edgecolor='black', linewidth=0.5) + + ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold') + ax.set_xlabel(metric, fontsize=12) + ax.set_ylabel('Frequency', fontsize=12) + ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left') + ax.grid(True, alpha=0.3) + +plt.tight_layout() +plt.show() + +#TODO: Compute count, min, quantiles and max by model +#TODO: Calculate efficiency metrics: Totel Token Usage, Cost per Token, Tokens per Second, Cost per Second + + + The script outputs a CSV with the following columns: * Evaluates LLM outputs for: - * Extractiveness Coverage - * Extractiveness Density - * Extractiveness Compression + * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article + * Extractiveness Density: Average length of the extractive fragement to which each word in the summary belongs + * Extractiveness Compression: Word ratio between the article and the summary * Computes: diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 049451f1..4e55690d 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -7,10 +7,8 @@ import logging from abc import ABC, abstractmethod -import anthropic import openai - class BaseModelHandler(ABC): @abstractmethod def handle_request( @@ -18,136 +16,7 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass - -class ClaudeHaiku35CitationsHandler(BaseModelHandler): - MODEL = "claude-3-5-haiku-20241022" - # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing - PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} - - def __init__(self) -> None: - self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the Claude Haiku 3.5 model with citations enabled - - Args: - query: The user query to be processed - context: The context or document content to be used for citations - - """ - - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - message = self.client.messages.create( - model=self.MODEL, - max_tokens=1024, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "document", - "source": {"type": "content", "content": context}, - "citations": {"enabled": True}, - }, - ], - } - ], - ) - duration = time.time() - start_time - - # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure - - text = [] - cited_text = [] - for content in message.to_dict()["content"]: - text.append(content["text"]) - if "citations" in content.keys(): - text.append( - " ".join( - [ - f"<{citation['start_block_index']} - {citation['end_block_index']}>" - for citation in content["citations"] - ] - ) - ) - cited_text.append( - " ".join( - [ - f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" - for citation in content["citations"] - ] - ) - ) - - full_text = " ".join(text) - - return ( - full_text, - message.usage, - self.PRICING_DOLLARS_PER_MILLION_TOKENS, - duration, - ) - - -class 
ClaudeHaiku3Handler(BaseModelHandler): - MODEL = "claude-3-haiku-20240307" - # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing - PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} - - def __init__(self) -> None: - self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the Claude Haiku 3 model with citations disabled - - Args: - query: The user query to be processed - context: The context or document content to be used - - """ - - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - message = self.client.messages.create( - model=self.MODEL, - max_tokens=1024, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "document", - "source": {"type": "content", "content": context}, - "citations": {"enabled": False}, - }, - ], - } - ], - ) - duration = time.time() - start_time - - text = [] - for content in message.to_dict()["content"]: - text.append(content["text"]) - - full_text = " ".join(text) - - return ( - full_text, - message.usage, - self.PRICING_DOLLARS_PER_MILLION_TOKENS, - duration, - ) - + # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" @@ -174,6 +43,7 @@ def handle_request( model=self.MODEL, instructions=query, input=context, + temperature=0.0 ) duration = time.time() - start_time @@ -196,24 +66,55 @@ class GPT41NanoHandler(BaseModelHandler): # Long context performance can degrade as more items are required to be retrieved, # or perform complex reasoning that requires knowledge of the state of the entire context + # + INSTRUCTIONS = """ # Role and Objective + + - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim - - You are a seasoned physician or medical professional who treats patients with bipolar disorder - - You are analyzing medical research by processing peer-reviewed papers to extract key details + - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID # Instructions - - Identify rules for medication inclusion or exclusion based on medical history or concerns + - Identify decision points for bipolar medications - - Only use retrieved context and never rely on your own knowledge for any of these questions. - - Always follow the provided output format for new messages including citations for any factual statements + - For each decision point you find, return a JSON object using the following format: + + { + "criterion": "", + "decision": "INCLUDE" or "EXCLUDE", + "medications": ["", "", ...], + "reason": "", + "sources": [""] + } - # Output Format - - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). 
+ - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + + # Output Format + - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array + + # Example + + [ + { + "criterion": "History of suicide attempts", + "decision": "INCLUDE", + "medications": ["Lithium"], + "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", + "sources": ["ID-0"] + }, + { + "criterion": "Weight gain concerns", + "decision": "EXCLUDE", + "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], + "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", + "sources": ["ID-0", "ID-1", "ID-2"] + } + ] """ @@ -244,6 +145,7 @@ def handle_request( model=self.MODEL, instructions=query, input=context, + temperature=0.0 ) duration = time.time() - start_time @@ -256,9 +158,12 @@ def handle_request( class ModelFactory: + + #TODO: Define structured fields to extract from unstructured input data + #https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples + + HANDLERS = { - "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler, - "CLAUDE_HAIKU_3": ClaudeHaiku3Handler, "GPT_4O_MINI": GPT4OMiniHandler, "GPT_41_NANO": GPT41NanoHandler, } From 6e41cf7dc23a47c19575526d3b9837c51553b97a Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 3 Jul 2025 13:22:57 -0400 Subject: [PATCH 05/15] Refactor README.md: add TODOs --- evaluation/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 5e088880..aab19564 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -179,13 +179,13 @@ plt.tight_layout() plt.show() #TODO: Compute count, min, quantiles and max by model -#TODO: Calculate efficiency metrics: Totel Token Usage, Cost per Token, Tokens per Second, Cost per Second - - +#TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second The script outputs a CSV with the following columns: +#TODO: Summarize https://aclanthology.org/N18-1065.pdf + * Evaluates LLM outputs for: * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article @@ -196,4 +196,4 @@ The script outputs a CSV with the following columns: * Token usage (input/output) * Estimated cost in USD - * Duration (in seconds) + * Duration (in seconds) \ No newline at end of file From c483e693a78990f2fbb6f86a410eb41c1e21df27 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Mon, 7 Jul 2025 21:00:44 -0400 Subject: [PATCH 06/15] DOC Add TODO items, update comments and improve code comments for clarity --- evaluation/README.md | 35 +++++++++++++++++++++++++++++ server/api/services/llm_services.py | 8 ++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index aab19564..f38097e6 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,6 +1,8 @@ # Evaluations +#TODO: Open AI evals documentaiton: https://platform.openai.com/docs/guides/evals + ## LLM Output Evaluator The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost. 
@@ -12,6 +14,8 @@ It supports batch evalaution via a configuration CSV and produces a detailed met This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +##TODO: Use uv to execute scripts without manually manging enviornments https://docs.astral.sh/uv/guides/scripts/ + Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. @@ -138,6 +142,37 @@ df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks' df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) ``` +``` +echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc + +createdb backupDBBalancer07012025 +pg_restore -v -d backupDBBalancer07012025 ~/Downloads/backupDBBalancer07012025.sql + +pip install psycopg2-binary + +from sqlalchemy import create_engine +import pandas as pd + +# Alternative: Standard psycopg2 connection (if you get psycopg2 working) +# engine = create_engine("postgresql://sahildshah@localhost:5432/backupDBBalancer07012025") + +# Fixed the variable name (was "database query", now "query") +query = "SELECT * FROM api_embeddings;" + +# Execute the query and load into DataFrame +df = pd.read_sql(query, engine) + +df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining +df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) +df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) +df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +``` + + + - Path where the evaluation resuls will be saved import pandas as pd diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 4e55690d..55627cf1 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -16,6 +16,8 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass +# LLM Pricing Calculator: https://www.llm-prices.com/ + # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): @@ -78,7 +80,7 @@ class GPT41NanoHandler(BaseModelHandler): # Instructions - - Identify decision points for bipolar medications + - Identify decision points for bipolar medications #TODO: "pharmacological and procedurl interventions" - For each decision point you find, return a JSON object using the following format: @@ -88,11 +90,15 @@ class GPT41NanoHandler(BaseModelHandler): "medications": ["", "", ...], "reason": "", "sources": [""] + "hierarchy": Primary: Contraindictions for allergies + "override" Exclude for allergy } - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + - TODO: Test against medication indication file + # Output Format - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array From c03d990a21fd60fc219fba5a3d9c3b7c4d98e2b5 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 9 Jul 2025 16:27:17 -0400 Subject: [PATCH 07/15] Update README with detailed usage 
instructions and enhance evals.py to include environment setup and dependencies --- evaluation/README.md | 193 +++++++++------------------- evaluation/evals.py | 18 ++- server/api/services/llm_services.py | 4 +- 3 files changed, 80 insertions(+), 135 deletions(-) mode change 100644 => 100755 evaluation/evals.py diff --git a/evaluation/README.md b/evaluation/README.md index f38097e6..5e95aaab 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,31 +1,36 @@ - # Evaluations -#TODO: Open AI evals documentaiton: https://platform.openai.com/docs/guides/evals - ## LLM Output Evaluator -The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost. +The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost -It supports batch evalaution via a configuration CSV and produces a detailed metrics report in CSV format. +This script helps teams compare LLM outputs using extractiveness metrics, token usage, and cost. It is especially useful for evaluating multiple models over a batch of queries and reference answers. -### Usage +It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format. -This script evaluates LLM outputs using the `lighteval` library: -https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +### Usage -##TODO: Use uv to execute scripts without manually manging enviornments https://docs.astral.sh/uv/guides/scripts/ +Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: -Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. +```sh +uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +``` +Execute without using uv run by ensuring it is executable: -```bash -python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv +```sh +./evals.py --config path/to/ --reference path/to/ --output path/to/ ``` The arguments to the script are: - Path to the config CSV file: Must include the columns "Model Name" and "Query" +- Path to the reference CSV file: Must include the columns "Context" and "Reference" +- Path where the evaluation results will be saved + +### Configuration File + +Generate the config CSV file: ``` import pandas as pd @@ -34,84 +39,13 @@ import pandas as pd data = [ { - "Model Name": "GPT_4O_MINI", - "Query": """ - You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X]. - - Act as a seasoned physician or medical professional who treats patients with bipolar disorder. - - Identify rules for medication inclusion or exclusion based on medical history or concerns. - - For each rule you find, return a JSON object using the following format: - - { - "rule": "", - "type": "INCLUDE" or "EXCLUDE", - "reason": "", - "medications": ["", "", ...], - "source": "" - } - - Only include rules that are explicitly stated or strongly implied in the chunk. - - Only use the chunks provided. If no rule is found in a chunk, skip it. - - Return the entire output as a JSON array. 
- """ + "Model Name": "", + "Query": """""" }, { - "Model Name": "GPT_41_NANO", - "Query": """ - - # Role and Objective - - - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim - - - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID - - # Instructions - - - Identify decision points for bipolar medications - - - For each decision point you find, return a JSON object using the following format: - - { - "criterion": "", - "decision": "INCLUDE" or "EXCLUDE", - "medications": ["", "", ...], - "reason": "", - "sources": [""] - } - - - - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge - - # Output Format - - - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array - - # Example - - [ - { - "criterion": "History of suicide attempts", - "decision": "INCLUDE", - "medications": ["Lithium"], - "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", - "sources": ["ID-0"] - }, - { - "criterion": "Weight gain concerns", - "decision": "EXCLUDE", - "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], - "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", - "sources": ["ID-0", "ID-1", "ID-2"] - } - ] - - """ - + "Model Name": "", + "Query": """""" }, ] @@ -119,68 +53,79 @@ data = [ df = pd.DataFrame.from_records(data) # Write to CSV -df.to_csv("~/Desktop/evals_config.csv", index=False) +df.to_csv("", index=False) ``` -- Path to the reference CSV file: Must include the columns "Context" and "Reference" +### Reference File + +Generate the reference file by connecting to a database of references + +Connect to the Postgres database of your local Balancer instance: ``` from sqlalchemy import create_engine -import pandas as pd engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev") -# Filter out papers that shouldn't be used from local database -query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';" -df = pd.read_sql(query, engine) - -df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) -# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining -df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) -df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() -df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) ``` +Connect to the Postgres database of the production Balancer instance using a SQL file: + ``` +# Install Postgres.app and add binaries to the PATH echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc -source ~/.zshrc -createdb backupDBBalancer07012025 -pg_restore -v -d backupDBBalancer07012025 ~/Downloads/backupDBBalancer07012025.sql +createdb +pg_restore -v -d .sql -pip install psycopg2-binary +engine = create_engine("postgresql://@localhost:5432/") +``` -from sqlalchemy import create_engine -import pandas as pd +Generate the reference CSV file: -# Alternative: 
Standard psycopg2 connection (if you get psycopg2 working) -# engine = create_engine("postgresql://sahildshah@localhost:5432/backupDBBalancer07012025") +``` +import pandas as pd -# Fixed the variable name (was "database query", now "query") query = "SELECT * FROM api_embeddings;" - -# Execute the query and load into DataFrame df = pd.read_sql(query, engine) df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) + # Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() + df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +df_grouped.to_csv('', index=False) ``` +### Output File + +The script outputs a CSV with the following columns: + +Extractiveness Metrics based on the methodology from: https://aclanthology.org/N18-1065.pdf + +* Evaluates LLM outputs for: + + * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article + * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs + * Extractiveness Compression: Word ratio between the article and the summary + +* Computes: + * Token usage (input/output) + * Estimated cost in USD + * Duration (in seconds) -- Path where the evaluation resuls will be saved +Exploratory data analysis: + +``` import pandas as pd import matplotlib.pyplot as plt import numpy as np - -df = pd.read_csv("~/Desktop/evals_out-20250702.csv") +df = pd.read_csv("") # Define the metrics of interest extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] @@ -213,22 +158,6 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() -#TODO: Compute count, min, quantiles and max by model #TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second - -The script outputs a CSV with the following columns: - -#TODO: Summarize https://aclanthology.org/N18-1065.pdf - -* Evaluates LLM outputs for: - - * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article - * Extractiveness Density: Average length of the extractive fragement to which each word in the summary belongs - * Extractiveness Compression: Word ratio between the article and the summary - -* Computes: - - * Token usage (input/output) - * Estimated cost in USD - * Duration (in seconds) \ No newline at end of file +``` \ No newline at end of file diff --git a/evaluation/evals.py b/evaluation/evals.py old mode 100644 new mode 100755 index a263d3bc..9e597d3f --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -1,10 +1,24 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = "==3.11.11" +# dependencies = [ +# "pandas==2.2.3", +# "lighteval==0.10.0", +# "openai==1.83.0" +# ] +# /// + """ Evaluate LLM outputs using multiple metrics and compute associated costs """ -#TODO: Run this script with uv to manage dependencies +#This script evaluates LLM outputs using the `lighteval` library +#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks + +#This script uses Python 3.11 where prebuilt wheels for 
`sentencepiece` exist + -# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs +#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys import os diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 55627cf1..7137f026 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -17,11 +17,13 @@ def handle_request( pass # LLM Pricing Calculator: https://www.llm-prices.com/ +# TODO: Add support for more models and their pricing - # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing +# Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" + # TODO: Get the latest model pricing from OpenAI's API or documentation # Model Pricing: https://platform.openai.com/docs/pricing PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} From 4f8cbad2803381fbd9c0078b55a5a0ba43ca1fc9 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 12:04:13 -0400 Subject: [PATCH 08/15] Update README to clarify the purpose and usage of the script, --- evaluation/README.md | 123 ++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 5e95aaab..c06353f4 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,63 +1,27 @@ # Evaluations -## LLM Output Evaluator +## `evals`: LLM evaluations to test and improve model outputs -The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost +LLM evals test a prompt with a set of test data by scoring each item in the data set -This script helps teams compare LLM outputs using extractiveness metrics, token usage, and cost. It is especially useful for evaluating multiple models over a batch of queries and reference answers. +To test Balancer's structured text extraction of medication rules, `evals` computes: -It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format. 
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): -### Usage +* Extractiveness Coverage: + - Percentage of words in the summary that are part of an extractive fragment with the article +* Extractiveness Density: + - Average length of the extractive fragment to which each word in the summary belongs +* Extractiveness Compression: + - Word ratio between the article and the summary -Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: - -```sh -uv run evals.py --config path/to/ --reference path/to/ --output path/to/ -``` - -Execute without using uv run by ensuring it is executable: - -```sh -./evals.py --config path/to/ --reference path/to/ --output path/to/ -``` - -The arguments to the script are: - -- Path to the config CSV file: Must include the columns "Model Name" and "Query" -- Path to the reference CSV file: Must include the columns "Context" and "Reference" -- Path where the evaluation results will be saved - -### Configuration File - -Generate the config CSV file: - -``` -import pandas as pd - -# Define the data -data = [ - - { - "Model Name": "", - "Query": """""" - }, - - { - "Model Name": "", - "Query": """""" - }, -] - -# Create DataFrame from records -df = pd.DataFrame.from_records(data) - -# Write to CSV -df.to_csv("", index=False) -``` +API usage: +* Token usage (input/output) +* Estimated cost in USD +* Duration (in seconds) -### Reference File +### Test Data: Generate the reference file by connecting to a database of references @@ -77,7 +41,10 @@ echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH createdb pg_restore -v -d .sql +``` +``` +from sqlalchemy import create_engine engine = create_engine("postgresql://@localhost:5432/") ``` @@ -99,26 +66,54 @@ df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks' df_grouped.to_csv('', index=False) ``` -### Output File -The script outputs a CSV with the following columns: +### Running an Evaluation -Extractiveness Metrics based on the methodology from: https://aclanthology.org/N18-1065.pdf +#### Test Input: Bulk model and prompt experimentation -* Evaluates LLM outputs for: +Compare the results of many different prompts and models at once - * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article - * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs - * Extractiveness Compression: Word ratio between the article and the summary +``` +import pandas as pd -* Computes: +# Define the data +data = [ - * Token usage (input/output) - * Estimated cost in USD - * Duration (in seconds) + { + "Model Name": "", + "Query": """""" + }, + { + "Model Name": "", + "Query": """""" + }, +] -Exploratory data analysis: +# Create DataFrame from records +df = pd.DataFrame.from_records(data) + +# Write to CSV +df.to_csv("", index=False) +``` + + +#### Execute on the command line + + +Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: + +```sh +uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +``` + +Execute without using uv run by ensuring it is executable: + +```sh +./evals.py --config path/to/ --reference path/to/ --output path/to/ +``` + +### Analyzing Test Results ``` import pandas as pd @@ -158,6 +153,4 @@ for i, metric in enumerate(all_metrics): 
plt.tight_layout() plt.show() -#TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second - ``` \ No newline at end of file From fe302b5ef5b6c254adcd9b9065e3552563b8403d Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 12:07:07 -0400 Subject: [PATCH 09/15] ADD TODOs --- evaluation/evals.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 9e597d3f..e13e79c8 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -3,7 +3,7 @@ # requires-python = "==3.11.11" # dependencies = [ # "pandas==2.2.3", -# "lighteval==0.10.0", +# "lighteval==0.10.0", # "openai==1.83.0" # ] # /// @@ -12,13 +12,13 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ -#This script evaluates LLM outputs using the `lighteval` library -#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +# This script evaluates LLM outputs using the `lighteval` library +# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks -#This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist +# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist -#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs +# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys import os @@ -40,9 +40,7 @@ ) -def evaluate_response( - model_name: str, query: str, context: str -) -> pd.DataFrame: +def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame: """ Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost @@ -91,6 +89,8 @@ def evaluate_response( if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the config file + # TODO: Add CLI argument to specify the metrics to be computed parser = argparse.ArgumentParser( description="Evaluate LLM outputs using multiple metrics and compute associated costs" @@ -149,9 +149,7 @@ def evaluate_response( df_evals = pd.concat( [ df_evals, - evaluate_response( - row["Model Name"], row["Query"], row["Context"] - ), + evaluate_response(row["Model Name"], row["Query"], row["Context"]), ], axis=0, ) From 3c9a1c9127cabc560fde3507c687c4d39e74d576 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 15:30:52 -0400 Subject: [PATCH 10/15] Update README for clearer instructions, refactor evals.py for better error handling, column validation, and batch processing --- evaluation/README.md | 21 ++--- evaluation/evals.py | 220 ++++++++++++++++++++++++------------------- 2 files changed, 133 insertions(+), 108 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index c06353f4..9e8cfa5d 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -21,9 +21,9 @@ API usage: * Estimated cost in USD * Duration (in seconds) -### Test Data: +### Test Data -Generate the reference file by connecting to a database of references +Generate the dataset file by connecting to a database of references Connect to the Postgres database of your local Balancer instance: @@ -48,7 +48,7 @@ from sqlalchemy import create_engine engine = create_engine("postgresql://@localhost:5432/") ``` -Generate the reference CSV file: +Generate the dataset CSV file: ``` import pandas as pd @@ 
-63,7 +63,7 @@ df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('', index=False) +df_grouped.to_csv('', index=False) ``` @@ -94,7 +94,7 @@ data = [ df = pd.DataFrame.from_records(data) # Write to CSV -df.to_csv("", index=False) +df.to_csv("", index=False) ``` @@ -104,13 +104,13 @@ df.to_csv("", index=False) Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: ```sh -uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/ ``` Execute without using uv run by ensuring it is executable: ```sh -./evals.py --config path/to/ --reference path/to/ --output path/to/ +./evals.py --experiments path/to/ --dataset path/to/ --results path/to/ ``` ### Analyzing Test Results @@ -120,7 +120,7 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np -df = pd.read_csv("") +df = pd.read_csv("") # Define the metrics of interest extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] @@ -132,7 +132,7 @@ all_metrics = extractiveness_cols + token_cols + other_metrics plt.style.use('default') fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) -models = df['Model Name'].unique() +models = df['MODEL'].unique() colors = plt.cm.Set3(np.linspace(0, 1, len(models))) for i, metric in enumerate(all_metrics): @@ -140,7 +140,7 @@ for i, metric in enumerate(all_metrics): # Create histogram for each model for j, model in enumerate(models): - model_data = df[df['Model Name'] == model][metric] + model_data = df[df['MODEL'] == model][metric] ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)), color=colors[j], edgecolor='black', linewidth=0.5) @@ -152,5 +152,4 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() - ``` \ No newline at end of file diff --git a/evaluation/evals.py b/evaluation/evals.py index e13e79c8..9c2a30b7 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -12,14 +12,6 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ -# This script evaluates LLM outputs using the `lighteval` library -# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks - -# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist - - -# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs - import sys import os @@ -30,6 +22,8 @@ import logging import pandas as pd + +# lighteval depends on `sentencepiece` and it only has prebuilt wheels for Python 3.11 or below from lighteval.tasks.requests import Doc from lighteval.metrics.metrics_sample import Extractiveness @@ -40,129 +34,161 @@ ) -def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame: +def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame: + """ + Test a prompt with a set of test data by scoring each item in the data set """ - Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost - Args: - model_name (str): The name of the model to be used for evaluation. 
- query (str): The user query to be processed. - context (str): The context or document content to be used. - reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations). + try: + handler = ModelFactory.get_handler(model) - Returns: - pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration. - """ + generated_text, token_usage, pricing, duration = handler.handle_request( + instructions, input + ) - handler = ModelFactory.get_handler(model_name) + doc = Doc(query="", choices=[], gold_index=0, specific={"text": input}) + extractiveness = Extractiveness().compute( + formatted_doc=doc, predictions=[generated_text] + ) - # TODO: Add error handling for unsupported models + cost_metrics = calculate_cost_metrics(token_usage, pricing) - output_text, token_usage, pricing, duration = handler.handle_request(query, context) + result = pd.DataFrame( + [ + { + "Generated Text": generated_text, + "Extractiveness Coverage": extractiveness["summarization_coverage"], + "Extractiveness Density": extractiveness["summarization_density"], + "Extractiveness Compression": extractiveness[ + "summarization_compression" + ], + "Input Token Usage": token_usage.input_tokens, + "Output Token Usage": token_usage.output_tokens, + "Cost (USD)": cost_metrics["total_cost"], + "Duration (s)": duration, + } + ] + ) - doc = Doc(query="", choices=[], gold_index=0, specific={"text": context}) - extractiveness = Extractiveness().compute( - formatted_doc=doc, predictions=[output_text] - ) + except Exception as e: + logging.error(f"Error evaluating response for model {model}: {e}") + result = pd.DataFrame( + [ + { + "Generated Text": None, + "Extractiveness Coverage": None, + "Extractiveness Density": None, + "Extractiveness Compression": None, + "Input Token Usage": None, + "Output Token Usage": None, + "Cost (USD)": None, + "Duration (s)": None, + } + ] + ) + + return result + + +def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict: + """ + Calculate cost metrics based on token usage and pricing + """ - input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens - output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens + TOKENS_PER_MILLION = 1_000_000 + # Pricing is in dollars per million tokens + input_cost_dollars = ( + pricing["input"] / TOKENS_PER_MILLION + ) * token_usage.input_tokens + output_cost_dollars = ( + pricing["output"] / TOKENS_PER_MILLION + ) * token_usage.output_tokens total_cost_dollars = input_cost_dollars + output_cost_dollars - return pd.DataFrame( - [ - { - "Output Text": output_text, - "Extractiveness Coverage": extractiveness["summarization_coverage"], - "Extractiveness Density": extractiveness["summarization_density"], - "Extractiveness Compression": extractiveness[ - "summarization_compression" - ], - "Input Token Usage": token_usage.input_tokens, - "Output Token Usage": token_usage.output_tokens, - "Cost (USD)": total_cost_dollars, - "Duration (s)": duration, - } - ] - ) + return { + "input_cost": input_cost_dollars, + "output_cost": output_cost_dollars, + "total_cost": total_cost_dollars, + } -if __name__ == "__main__": - # TODO: Add test evaluation argument to run on the first 10 rows of the config file +def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: + """ + Load a CSV file and validate that it contains the required columns - # TODO: Add CLI argument to specify the metrics to be computed - parser = 
argparse.ArgumentParser( - description="Evaluate LLM outputs using multiple metrics and compute associated costs" - ) - parser.add_argument("--config", "-c", required=True, help="Path to config CSV file") - parser.add_argument( - "--reference", "-r", required=True, help="Path to reference CSV file" - ) - parser.add_argument("--output", "-o", required=True, help="Path to output CSV file") + Args: + file_path (str): Path to the CSV file + required_columns (list): List of required column names - args = parser.parse_args() + Returns: + pd.DataFrame + """ + + df = pd.read_csv(file_path) - df_config = pd.read_csv(args.config) - logging.info(f"Config DataFrame shape: {df_config.shape}") - logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}") + # Remove trailing whitespace from column names + df.columns = df.columns.str.strip() - # Remove the trailing whitespace from column names - df_config.columns = df_config.columns.str.strip() + # Uppercase the column names to match the expected format + df.columns = df.columns.str.upper() # Check if the required columns are present - # TODO: Make this more flexible by allowing the user to use default instructions - required_columns = ["Model Name", "Query"] - if not all(col in df_config.columns for col in required_columns): + if not all(col in df.columns for col in required_columns): raise ValueError( - f"Config DataFrame must contain the following columns: {required_columns}" + f"{file_path} must contain the following columns: {required_columns}" ) - # Check if all models in the config are supported by ModelFactory + return df + + +if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file + + parser = argparse.ArgumentParser() + parser.add_argument( + "--experiments", "-e", required=True, help="Path to experiments CSV file" + ) + parser.add_argument( + "--dataset", "-d", required=True, help="Path to dataset CSV file" + ) + parser.add_argument( + "--results", "-r", required=True, help="Path to results CSV file" + ) + + args = parser.parse_args() + + df_experiment = load_csv( + args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] + ) + # Check if all models are supported by ModelFactory if not all( model in ModelFactory.HANDLERS.keys() - for model in df_config["Model Name"].unique() + for model in df_experiment["MODEL"].unique() ): raise ValueError( - f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}" - ) - - df_reference = pd.read_csv(args.reference) - logging.info(f"Reference DataFrame shape: {df_reference.shape}") - logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}") - - # Remove the trailing whitespace from column names - df_reference.columns = df_reference.columns.str.strip() - # Check if the required columns are present - required_columns = ["Context"] - if not all(col in df_reference.columns for col in required_columns): - raise ValueError( - f"Reference DataFrame must contain the following columns: {required_columns}" + f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" ) + df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) - # Cross join the config and reference DataFrames - df_in = df_config.merge(df_reference, how="cross") + # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames + df_in = df_experiment.merge(df_dataset, how="cross") - # TODO: Parallelize the evaluation process 
for each row in df_in using concurrent.futures or similar libraries - df_evals = pd.DataFrame() - for index, row in df_in.iterrows(): - df_evals = pd.concat( - [ - df_evals, - evaluate_response(row["Model Name"], row["Query"], row["Context"]), - ], - axis=0, - ) + # Evaluate each row in the input DataFrame + results = [] + for index, row in enumerate(df_in.itertuples(index=False)): + result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) + results.append(result) + # TODO: Use tqdm or similar library to show progress bar logging.info(f"Processed row {index + 1}/{len(df_in)}") - # Concatenate the input and evaluations DataFrames + df_evals = pd.concat(results, axis=0, ignore_index=True) + # Concatenate the input and evaluations DataFrames df_out = pd.concat( [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1 ) - - df_out.to_csv(args.output, index=False) - logging.info(f"Output DataFrame shape: {df_out.shape}") - logging.info(f"Results saved to {args.output}") + df_out.to_csv(args.results, index=False) + logging.info(f"Results saved to {args.results}") logging.info("Evaluation completed successfully.") From d1dd75c2c9277e612473df049a2e482821d659b0 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 10:25:10 -0400 Subject: [PATCH 11/15] Update evaluation README with metrics and API usage details, and add 'Contributing' section --- evaluation/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 9e8cfa5d..48686950 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -2,12 +2,12 @@ ## `evals`: LLM evaluations to test and improve model outputs -LLM evals test a prompt with a set of test data by scoring each item in the data set - -To test Balancer's structured text extraction of medication rules, `evals` computes: +### Metrics [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): +Natural Language Generation Performance: + * Extractiveness Coverage: - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: @@ -15,10 +15,10 @@ To test Balancer's structured text extraction of medication rules, `evals` compu * Extractiveness Compression: - Word ratio between the article and the summary -API usage: +API Performance: -* Token usage (input/output) -* Estimated cost in USD +* Token Usage (input/output) +* Estimated Cost in USD * Duration (in seconds) ### Test Data @@ -152,4 +152,7 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() -``` \ No newline at end of file + +``` + +### Contributing From eef2a29d117e40f619a3dcac1f073aeee2555c6f Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 10:54:50 -0400 Subject: [PATCH 12/15] Update evaluation instructions, improve dataset generation section, and clarify external tools --- evaluation/README.md | 45 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 48686950..ddaf12c9 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -2,12 +2,12 @@ ## `evals`: LLM evaluations to test and improve model outputs -### Metrics - -[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): +### Evaluation Metrics Natural Language Generation Performance: 
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): + * Extractiveness Coverage: - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: @@ -23,7 +23,7 @@ API Performance: ### Test Data -Generate the dataset file by connecting to a database of references +Generate the dataset file by connecting to a database of research papers: Connect to the Postgres database of your local Balancer instance: @@ -36,72 +36,63 @@ engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/b Connect to the Postgres database of the production Balancer instance using a SQL file: ``` -# Install Postgres.app and add binaries to the PATH +# Add Postgres.app binaries to the PATH echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc createdb pg_restore -v -d .sql ``` -``` -from sqlalchemy import create_engine -engine = create_engine("postgresql://@localhost:5432/") -``` - Generate the dataset CSV file: ``` +from sqlalchemy import create_engine import pandas as pd +engine = create_engine("postgresql://@localhost:5432/") + query = "SELECT * FROM api_embeddings;" df = pd.read_sql(query, engine) -df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +df['INPUT'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) # Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) -df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df.groupby(['name', 'upload_file_id'])['INPUT'].apply(lambda chunks: "\n".join(chunks)).reset_index() -df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) df_grouped.to_csv('', index=False) ``` - ### Running an Evaluation -#### Test Input: Bulk model and prompt experimentation +#### Bulk Model and Prompt Experimentation Compare the results of many different prompts and models at once ``` import pandas as pd -# Define the data data = [ - { - "Model Name": "", - "Query": """""" + "MODEL": "", + "INSTRUCTIONS": """""" }, - { - "Model Name": "", - "Query": """""" + "MODEL": "", + "INSTRUCTIONS": """""" }, ] -# Create DataFrame from records df = pd.DataFrame.from_records(data) -# Write to CSV df.to_csv("", index=False) ``` -#### Execute on the command line +#### Execute on the Command Line -Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: +Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: ```sh uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/ @@ -156,3 +147,5 @@ plt.show() ``` ### Contributing + +You're welcome to add LLM models to test in `server/api/services/llm_services` \ No newline at end of file From ffa86f7d99f8e1df4dcf2dbedae9a56e7efe20dc Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 11:57:32 -0400 Subject: [PATCH 13/15] Update dependencies list in and correct comment syntax in --- evaluation/evals.py | 5 ++++- server/api/services/llm_services.py | 28 +++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 
9c2a30b7..08eda2bc 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -4,7 +4,10 @@ # dependencies = [ # "pandas==2.2.3", # "lighteval==0.10.0", -# "openai==1.83.0" +# "openai==1.83.0", +# "spacy==3.8.7", +# "pip" +# # ] # /// diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 7137f026..18c6e58f 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -9,6 +9,7 @@ import openai + class BaseModelHandler(ABC): @abstractmethod def handle_request( @@ -16,11 +17,13 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass + # LLM Pricing Calculator: https://www.llm-prices.com/ # TODO: Add support for more models and their pricing # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" # TODO: Get the latest model pricing from OpenAI's API or documentation @@ -44,10 +47,7 @@ def handle_request( start_time = time.time() # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( - model=self.MODEL, - instructions=query, - input=context, - temperature=0.0 + model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time @@ -67,7 +67,7 @@ class GPT41NanoHandler(BaseModelHandler): # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - # Long context performance can degrade as more items are required to be retrieved, + # Long context performance can degrade as more items are required to be retrieved, # or perform complex reasoning that requires knowledge of the state of the entire context # @@ -82,7 +82,7 @@ class GPT41NanoHandler(BaseModelHandler): # Instructions - - Identify decision points for bipolar medications #TODO: "pharmacological and procedurl interventions" + - Identify decision points for bipolar medications - For each decision point you find, return a JSON object using the following format: @@ -92,15 +92,11 @@ class GPT41NanoHandler(BaseModelHandler): "medications": ["", "", ...], "reason": "", "sources": [""] - "hierarchy": Primary: Contraindictions for allergies - "override" Exclude for allergy } - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge - - TODO: Test against medication indication file - # Output Format - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array @@ -145,15 +141,11 @@ def handle_request( if not query: query = self.INSTRUCTIONS - start_time = time.time() # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( - model=self.MODEL, - instructions=query, - input=context, - temperature=0.0 + model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time @@ -166,10 +158,8 @@ def handle_request( class ModelFactory: - - #TODO: Define structured fields to extract from unstructured input data - #https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples - + # TODO: Define structured fields to extract from unstructured input data + # https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples HANDLERS = { "GPT_4O_MINI": GPT4OMiniHandler, From 
42a494951a9c83f6f421c74ee70d6c6e7f0e6a36 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 20:30:28 -0400 Subject: [PATCH 14/15] Update README.md --- evaluation/README.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index ddaf12c9..6e1a1cf2 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -9,11 +9,16 @@ Natural Language Generation Performance: [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): * Extractiveness Coverage: + - Extent to which a summary is derivative of a text - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: + - How well the word sequence can be described as series of extractions + - A summary might contain many individual words from the article and therefore have a high coverage. + - However, if arranged in a new order, the words of the summary could still be used to convey ideas not present in the article - Average length of the extractive fragment to which each word in the summary belongs * Extractiveness Compression: - Word ratio between the article and the summary + - Summarizing with higher compression is challenging as it requires capturing more precisely the critical aspects of the article text. API Performance: @@ -119,7 +124,7 @@ token_cols = ['Input Token Usage', 'Output Token Usage'] other_metrics = ['Cost (USD)', 'Duration (s)'] all_metrics = extractiveness_cols + token_cols + other_metrics -# Metric histograms by model +# Metric Histograms by Model plt.style.use('default') fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) @@ -144,6 +149,36 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() +# Metric Statistics by Model +for metric in all_metrics: + print(f"\n{metric.upper()}:") + desc_stats = df.groupby('MODEL')[metric].agg([ + 'count', 'mean', 'std', 'min', 'median','max' + ]) + + print(desc_stats) + + +# Calculate Efficiency Metrics By model +df_analysis = df.copy() +df_analysis['Total Token Usage'] = df_analysis['Input Token Usage'] + df_analysis['Output Token Usage'] +df_analysis['Cost per Token'] = df_analysis['Cost (USD)'] / df_analysis['Total Token Usage'] +df_analysis['Tokens per Second'] = df_analysis['Total Token Usage'] / df_analysis['Duration (s)'] +df_analysis['Cost per Second'] = df_analysis['Cost (USD)'] / df_analysis['Duration (s)'] + +efficiency_metrics = ['Cost per Token', 'Tokens per Second', 'Cost per Second'] + +for metric in efficiency_metrics: + print(f"\n{metric.upper()}:") + eff_stats = df_analysis.groupby('MODEL')[metric].agg([ + 'count', 'mean', 'std', 'min', 'median', 'max' + ]) + + for col in ['mean', 'std', 'min', 'median', 'max']: + eff_stats[col] = eff_stats[col].apply(lambda x: f"{x:.3g}") + print(eff_stats) + + ``` ### Contributing From 0e2893b413ba2b318b2e43ff0177c36107cd4c7a Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 20:59:26 -0400 Subject: [PATCH 15/15] Update README.md --- evaluation/README.md | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 6e1a1cf2..669141d8 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -8,17 +8,9 @@ Natural Language Generation Performance: [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): -* Extractiveness 
Coverage:
- - Extent to which a summary is derivative of a text
- - Percentage of words in the summary that are part of an extractive fragment with the article
-* Extractiveness Density:
- - How well the word sequence can be described as series of extractions
- - A summary might contain many individual words from the article and therefore have a high coverage.
- - However, if arranged in a new order, the words of the summary could still be used to convey ideas not present in the article
- - Average length of the extractive fragment to which each word in the summary belongs
-* Extractiveness Compression:
- - Word ratio between the article and the summary
- - Summarizing with higher compression is challenging as it requires capturing more precisely the critical aspects of the article text.
+* Extractiveness Coverage: Extent to which a summary is derivative of a text
+* Extractiveness Density: How well the word sequence can be described as a series of extractions
+* Extractiveness Compression: Word ratio between the article and the summary
 
 API Performance: