From 3a174576d21156f99f79e0bde40c9d48360399bf Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 18 Jun 2025 17:47:22 -0400 Subject: [PATCH 01/15] GPT41 Nano prompting --- server/api/services/llm_services.py | 323 ++++++++++++++++++++++ server/api/views/text_extraction/views.py | 25 +- 2 files changed, 343 insertions(+), 5 deletions(-) create mode 100644 server/api/services/llm_services.py diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py new file mode 100644 index 00000000..7ab9c176 --- /dev/null +++ b/server/api/services/llm_services.py @@ -0,0 +1,323 @@ +""" +This module contains functions to interact with different AI models +""" + +import os +import time +import logging +from abc import ABC, abstractmethod + +import anthropic +import openai + + +class BaseModelHandler(ABC): + @abstractmethod + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + pass + + +class ClaudeHaiku35CitationsHandler(BaseModelHandler): + MODEL = "claude-3-5-haiku-20241022" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3.5 model with citations enabled + + Args: + query: The user query to be processed + context: The context or document content to be used for citations + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": True}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure + + text = [] + cited_text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + if "citations" in content.keys(): + text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}>" + for citation in content["citations"] + ] + ) + ) + cited_text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" + for citation in content["citations"] + ] + ) + ) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ClaudeHaiku3Handler(BaseModelHandler): + MODEL = "claude-3-haiku-20240307" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3 model with citations disabled + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + + start_time = time.time() + # TODO: Add error handling 
for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": False}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT4OMiniHandler(BaseModelHandler): + MODEL = "gpt-4o-mini" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4o Mini model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT41NanoHandler(BaseModelHandler): + MODEL = "gpt-4.1-nano" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4.1 Nano model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide + + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context + + """ + + # Role and Objective + + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details + + # Instructions + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use the documents in the provided External Context to answer the User Query. + If you don't know the answer based on this context, you must respond + "I don't have the information needed to answer that", even if a user insists on you answering the question. + + - Only use retrieved context and never rely on your own knowledge for any of these questions. + + - Do not discuss prohibited topics (politics, religion, controversial current events, + medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company). 
+ + - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents. + + # Output Format + + The rule is history of suicide attempts. The type of rule is "INCLUDE". The reason is lithium is the + only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder. + The medications for this rule are lithium. + + The rule is weight gain concerns. The type of rule is "EXCLUDE". The reason is Seroquel, Risperdal, Abilify, and + Zyprexa are known for causing weight gain. The medications for this rule are Quetiapine, Aripiprazole, Olanzapine, Risperidone + + For each rule you find, return a JSON object using the following format: + + { + "rule": "", + "type": "INCLUDE" or "EXCLUDE", + "reason": "", + "medications": ["", "", ...], + "source": "" + } + + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). + Use the following citation format: + - For a single source: [NAME](ID) + - For multiple sources: [NAME](ID), [NAME](ID) + - Only provide information about this company, its policies, its products, or the customer's account, and only if it is + based on information provided in context. Do not answer questions outside this scope. + + + # Examples + + + # Context + + ID: 1 | TITLE: The Fox | CONTENT: The quick brown fox jumps over the lazy dog + + # Final instructions and prompt to think step by step + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use the documents in the provided External Context to answer the User Query. + If you don't know the answer based on this context, you must respond + "I don't have the information needed to answer that", even if a user insists on you answering the question. + + """ + + + + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ModelFactory: + HANDLERS = { + "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler, + "CLAUDE_HAIKU_3": ClaudeHaiku3Handler, + "GPT_4O_MINI": GPT4OMiniHandler, + "GPT_41_NANO": GPT41NanoHandler, + } + + # HANDLERS doesn't vary per instance so we can use a class method + @classmethod + def get_handler(cls, model_name: str) -> BaseModelHandler | None: + """ + Factory method to get the appropriate model handler based on the model name + + Args: + model_name (str): The name of the model for which to get the handler. + Returns: + BaseModelHandler: An instance of the appropriate model handler class. 
+ """ + + handler_class = cls.HANDLERS.get(model_name) + if handler_class: + return handler_class() + else: + logging.error(f"Unsupported model: {model_name}") + return None diff --git a/server/api/views/text_extraction/views.py b/server/api/views/text_extraction/views.py index 92b34c9c..79b916ab 100644 --- a/server/api/views/text_extraction/views.py +++ b/server/api/views/text_extraction/views.py @@ -1,5 +1,7 @@ import os -from ...services.openai_services import openAIServices +import json +import re + from rest_framework.views import APIView from rest_framework.permissions import IsAuthenticated from rest_framework.response import Response @@ -7,16 +9,28 @@ from django.utils.decorators import method_decorator from django.views.decorators.csrf import csrf_exempt import anthropic -import json -import re + +from ...services.openai_services import openAIServices from api.models.model_embeddings import Embeddings -# TODO: Add docstrings and type hints -def anthropic_citations(client, content_chunks, user_prompt): +def anthropic_citations(client: anthropic.Client, user_prompt: str, content_chunks: list) -> tuple: """ + Sends a message to Anthropic Citations and extract and format the response + + Parameters + ---------- + client: An instance of the Anthropic API client used to make the request + user_prompt: The user's question or instruction to be processed by the model + content_chunks: A list of text chunks that provide context for the model to use during generation + + Returns + ------- + tuple + """ + message = client.messages.create( model="claude-3-5-haiku-20241022", max_tokens=1024, @@ -93,6 +107,7 @@ def get(self, request): query = Embeddings.objects.filter(upload_file__guid=guid) + # TODO: Format into the Anthropic API"s expected input format in the anthropic_citations function chunks = [{"type": "text", "text": chunk.text} for chunk in query] texts, cited_texts = anthropic_citations( From d47606726c9628eeb01a774b3c374c6e808f3efa Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 19 Jun 2025 13:26:11 -0400 Subject: [PATCH 02/15] HOTFIX ModuleNotFoundError --- server/api/views/uploadFile/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/api/views/uploadFile/title.py b/server/api/views/uploadFile/title.py index 637607a2..93453607 100644 --- a/server/api/views/uploadFile/title.py +++ b/server/api/views/uploadFile/title.py @@ -2,7 +2,7 @@ import fitz -from server.api.services.openai_services import openAIServices +from ...services.openai_services import openAIServices # regular expression to match common research white paper titles. Created by Chat-gpt # requires at least 3 words, no dates, no version numbers. From b18391980bc75b76572d94442cbfe87267df8e4c Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 19 Jun 2025 19:38:19 -0400 Subject: [PATCH 03/15] Refactor evaluation script and GPT-4.1 Nano handler for cleaner logic and improved defaults evaluation/evals.py: Removed unused Reference column from input requirements and function signature. Added TODOs to improve flexibility and support defaults for instructions. Suggested running the script with uv for dependency management. llm_services.py: Refactored GPT41NanoHandler: Moved long prompt instructions into a class-level INSTRUCTIONS string. Simplified and cleaned up the handle_request method. Added default fallback behavior: uses INSTRUCTIONS if no query is provided. Removed duplicate and commented-out prompt scaffolding. 
--- evaluation/evals.py | 9 ++- server/api/services/llm_services.py | 102 +++++++++------------------- 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index f6e9bb3d..a263d3bc 100644 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -2,6 +2,8 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ +#TODO: Run this script with uv to manage dependencies + # TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys @@ -25,7 +27,7 @@ def evaluate_response( - model_name: str, query: str, context: str, reference: str + model_name: str, query: str, context: str ) -> pd.DataFrame: """ Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost @@ -95,6 +97,7 @@ def evaluate_response( df_config.columns = df_config.columns.str.strip() # Check if the required columns are present + # TODO: Make this more flexible by allowing the user to use default instructions required_columns = ["Model Name", "Query"] if not all(col in df_config.columns for col in required_columns): raise ValueError( @@ -117,7 +120,7 @@ def evaluate_response( # Remove the trailing whitespace from column names df_reference.columns = df_reference.columns.str.strip() # Check if the required columns are present - required_columns = ["Context", "Reference"] + required_columns = ["Context"] if not all(col in df_reference.columns for col in required_columns): raise ValueError( f"Reference DataFrame must contain the following columns: {required_columns}" @@ -133,7 +136,7 @@ def evaluate_response( [ df_evals, evaluate_response( - row["Model Name"], row["Query"], row["Context"], row["Reference"] + row["Model Name"], row["Query"], row["Context"] ), ], axis=0, diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 7ab9c176..049451f1 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -187,98 +187,58 @@ def handle_request( class GPT41NanoHandler(BaseModelHandler): MODEL = "gpt-4.1-nano" + # Model Pricing: https://platform.openai.com/docs/pricing PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} - def __init__(self) -> None: - self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the GPT-4.1 Nano model - - Args: - query: The user query to be processed - context: The context or document content to be used - - """ - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - - # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - - # Long context performance can degrade as more items are required to be retrieved, - # or perform complex reasoning that requires knowledge of the state of the entire context - - """ - - # Role and Objective - - - You are a seasoned physician or medical professional who treats patients with bipolar disorder - - You are analyzing medical research by processing peer-reviewed papers to extract key details - - # Instructions - - - Identify rules for medication inclusion or exclusion based on medical history or concerns - - - Only use the documents in the provided External Context to answer the User Query. 
- If you don't know the answer based on this context, you must respond - "I don't have the information needed to answer that", even if a user insists on you answering the question. - - - Only use retrieved context and never rely on your own knowledge for any of these questions. + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - - Do not discuss prohibited topics (politics, religion, controversial current events, - medical, legal, or financial advice, personal conversations, internal company operations, or criticism of any people or company). + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context - - Always follow the provided output format for new messages, including citations for any factual statements from retrieved policy documents. - - # Output Format - - The rule is history of suicide attempts. The type of rule is "INCLUDE". The reason is lithium is the - only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder. - The medications for this rule are lithium. + INSTRUCTIONS = """ - The rule is weight gain concerns. The type of rule is "EXCLUDE". The reason is Seroquel, Risperdal, Abilify, and - Zyprexa are known for causing weight gain. The medications for this rule are Quetiapine, Aripiprazole, Olanzapine, Risperidone + # Role and Objective - For each rule you find, return a JSON object using the following format: + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details - { - "rule": "", - "type": "INCLUDE" or "EXCLUDE", - "reason": "", - "medications": ["", "", ...], - "source": "" - } + # Instructions - - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). - Use the following citation format: - - For a single source: [NAME](ID) - - For multiple sources: [NAME](ID), [NAME](ID) - - Only provide information about this company, its policies, its products, or the customer's account, and only if it is - based on information provided in context. Do not answer questions outside this scope. + - Identify rules for medication inclusion or exclusion based on medical history or concerns + - Only use retrieved context and never rely on your own knowledge for any of these questions. + - Always follow the provided output format for new messages including citations for any factual statements - # Examples + # Output Format + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). - # Context - ID: 1 | TITLE: The Fox | CONTENT: The quick brown fox jumps over the lazy dog + """ - # Final instructions and prompt to think step by step + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - - Identify rules for medication inclusion or exclusion based on medical history or concerns + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4.1 Nano model - - Only use the documents in the provided External Context to answer the User Query. 
- If you don't know the answer based on this context, you must respond - "I don't have the information needed to answer that", even if a user insists on you answering the question. + Args: + query: The user query to be processed + context: The context or document content to be used """ + # If no query is provided, use the default instructions + if not query: + query = self.INSTRUCTIONS + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( model=self.MODEL, From 6c592be5f9430762bb19049da1cbc15650cfd489 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 3 Jul 2025 13:21:19 -0400 Subject: [PATCH 04/15] Update evaluation README with example scripts and remove obsolete Claude API client code from service module. --- evaluation/README.md | 169 ++++++++++++++++++++++++- server/api/services/llm_services.py | 187 +++++++--------------------- 2 files changed, 210 insertions(+), 146 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index a1d0ad70..5e088880 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -9,9 +9,10 @@ It supports batch evalaution via a configuration CSV and produces a detailed met ### Usage -This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +This script evaluates LLM outputs using the `lighteval` library: +https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks -Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly. +Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. ```bash @@ -21,17 +22,175 @@ python evals.py --config path/to/config.csv --reference path/to/reference.csv -- The arguments to the script are: - Path to the config CSV file: Must include the columns "Model Name" and "Query" + +``` +import pandas as pd + +# Define the data +data = [ + + { + "Model Name": "GPT_4O_MINI", + "Query": """ + You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X]. + + Act as a seasoned physician or medical professional who treats patients with bipolar disorder. + + Identify rules for medication inclusion or exclusion based on medical history or concerns. + + For each rule you find, return a JSON object using the following format: + + { + "rule": "", + "type": "INCLUDE" or "EXCLUDE", + "reason": "", + "medications": ["", "", ...], + "source": "" + } + + Only include rules that are explicitly stated or strongly implied in the chunk. + + Only use the chunks provided. If no rule is found in a chunk, skip it. + + Return the entire output as a JSON array. 
+ """ + }, + + { + "Model Name": "GPT_41_NANO", + "Query": """ + + # Role and Objective + + - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim + + - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID + + # Instructions + + - Identify decision points for bipolar medications + + - For each decision point you find, return a JSON object using the following format: + + { + "criterion": "", + "decision": "INCLUDE" or "EXCLUDE", + "medications": ["", "", ...], + "reason": "", + "sources": [""] + } + + + - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + + # Output Format + + - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array + + # Example + + [ + { + "criterion": "History of suicide attempts", + "decision": "INCLUDE", + "medications": ["Lithium"], + "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", + "sources": ["ID-0"] + }, + { + "criterion": "Weight gain concerns", + "decision": "EXCLUDE", + "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], + "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", + "sources": ["ID-0", "ID-1", "ID-2"] + } + ] + + """ + + }, +] + +# Create DataFrame from records +df = pd.DataFrame.from_records(data) + +# Write to CSV +df.to_csv("~/Desktop/evals_config.csv", index=False) +``` + + - Path to the reference CSV file: Must include the columns "Context" and "Reference" + +``` +from sqlalchemy import create_engine +import pandas as pd + +engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev") +# Filter out papers that shouldn't be used from local database +query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';" +df = pd.read_sql(query, engine) + +df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining +df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) +df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) +df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +``` + - Path where the evaluation resuls will be saved +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + + +df = pd.read_csv("~/Desktop/evals_out-20250702.csv") + +# Define the metrics of interest +extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] +token_cols = ['Input Token Usage', 'Output Token Usage'] +other_metrics = ['Cost (USD)', 'Duration (s)'] +all_metrics = extractiveness_cols + token_cols + other_metrics + +# Metric histograms by model +plt.style.use('default') +fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) + +models = df['Model Name'].unique() +colors = plt.cm.Set3(np.linspace(0, 1, len(models))) + +for i, metric in enumerate(all_metrics): + ax = axes[i] if len(all_metrics) > 1 else axes + + # Create histogram for 
each model + for j, model in enumerate(models): + model_data = df[df['Model Name'] == model][metric] + ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)), + color=colors[j], edgecolor='black', linewidth=0.5) + + ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold') + ax.set_xlabel(metric, fontsize=12) + ax.set_ylabel('Frequency', fontsize=12) + ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left') + ax.grid(True, alpha=0.3) + +plt.tight_layout() +plt.show() + +#TODO: Compute count, min, quantiles and max by model +#TODO: Calculate efficiency metrics: Totel Token Usage, Cost per Token, Tokens per Second, Cost per Second + + + The script outputs a CSV with the following columns: * Evaluates LLM outputs for: - * Extractiveness Coverage - * Extractiveness Density - * Extractiveness Compression + * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article + * Extractiveness Density: Average length of the extractive fragement to which each word in the summary belongs + * Extractiveness Compression: Word ratio between the article and the summary * Computes: diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 049451f1..4e55690d 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -7,10 +7,8 @@ import logging from abc import ABC, abstractmethod -import anthropic import openai - class BaseModelHandler(ABC): @abstractmethod def handle_request( @@ -18,136 +16,7 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass - -class ClaudeHaiku35CitationsHandler(BaseModelHandler): - MODEL = "claude-3-5-haiku-20241022" - # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing - PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} - - def __init__(self) -> None: - self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the Claude Haiku 3.5 model with citations enabled - - Args: - query: The user query to be processed - context: The context or document content to be used for citations - - """ - - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - message = self.client.messages.create( - model=self.MODEL, - max_tokens=1024, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "document", - "source": {"type": "content", "content": context}, - "citations": {"enabled": True}, - }, - ], - } - ], - ) - duration = time.time() - start_time - - # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure - - text = [] - cited_text = [] - for content in message.to_dict()["content"]: - text.append(content["text"]) - if "citations" in content.keys(): - text.append( - " ".join( - [ - f"<{citation['start_block_index']} - {citation['end_block_index']}>" - for citation in content["citations"] - ] - ) - ) - cited_text.append( - " ".join( - [ - f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" - for citation in content["citations"] - ] - ) - ) - - full_text = " ".join(text) - - return ( - full_text, - message.usage, - self.PRICING_DOLLARS_PER_MILLION_TOKENS, - duration, - ) - - -class 
ClaudeHaiku3Handler(BaseModelHandler): - MODEL = "claude-3-haiku-20240307" - # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing - PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} - - def __init__(self) -> None: - self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) - - def handle_request( - self, query: str, context: str - ) -> tuple[str, dict[str, int], dict[str, float], float]: - """ - Handles the request to the Claude Haiku 3 model with citations disabled - - Args: - query: The user query to be processed - context: The context or document content to be used - - """ - - start_time = time.time() - # TODO: Add error handling for API requests and invalid responses - message = self.client.messages.create( - model=self.MODEL, - max_tokens=1024, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "document", - "source": {"type": "content", "content": context}, - "citations": {"enabled": False}, - }, - ], - } - ], - ) - duration = time.time() - start_time - - text = [] - for content in message.to_dict()["content"]: - text.append(content["text"]) - - full_text = " ".join(text) - - return ( - full_text, - message.usage, - self.PRICING_DOLLARS_PER_MILLION_TOKENS, - duration, - ) - + # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" @@ -174,6 +43,7 @@ def handle_request( model=self.MODEL, instructions=query, input=context, + temperature=0.0 ) duration = time.time() - start_time @@ -196,24 +66,55 @@ class GPT41NanoHandler(BaseModelHandler): # Long context performance can degrade as more items are required to be retrieved, # or perform complex reasoning that requires knowledge of the state of the entire context + # + INSTRUCTIONS = """ # Role and Objective + + - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim - - You are a seasoned physician or medical professional who treats patients with bipolar disorder - - You are analyzing medical research by processing peer-reviewed papers to extract key details + - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID # Instructions - - Identify rules for medication inclusion or exclusion based on medical history or concerns + - Identify decision points for bipolar medications - - Only use retrieved context and never rely on your own knowledge for any of these questions. - - Always follow the provided output format for new messages including citations for any factual statements + - For each decision point you find, return a JSON object using the following format: + + { + "criterion": "", + "decision": "INCLUDE" or "EXCLUDE", + "medications": ["", "", ...], + "reason": "", + "sources": [""] + } - # Output Format - - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). 
+ - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + + # Output Format + - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array + + # Example + + [ + { + "criterion": "History of suicide attempts", + "decision": "INCLUDE", + "medications": ["Lithium"], + "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", + "sources": ["ID-0"] + }, + { + "criterion": "Weight gain concerns", + "decision": "EXCLUDE", + "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], + "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", + "sources": ["ID-0", "ID-1", "ID-2"] + } + ] """ @@ -244,6 +145,7 @@ def handle_request( model=self.MODEL, instructions=query, input=context, + temperature=0.0 ) duration = time.time() - start_time @@ -256,9 +158,12 @@ def handle_request( class ModelFactory: + + #TODO: Define structured fields to extract from unstructured input data + #https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples + + HANDLERS = { - "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler, - "CLAUDE_HAIKU_3": ClaudeHaiku3Handler, "GPT_4O_MINI": GPT4OMiniHandler, "GPT_41_NANO": GPT41NanoHandler, } From 6e41cf7dc23a47c19575526d3b9837c51553b97a Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Thu, 3 Jul 2025 13:22:57 -0400 Subject: [PATCH 05/15] Refactor README.md: add TODOs --- evaluation/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 5e088880..aab19564 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -179,13 +179,13 @@ plt.tight_layout() plt.show() #TODO: Compute count, min, quantiles and max by model -#TODO: Calculate efficiency metrics: Totel Token Usage, Cost per Token, Tokens per Second, Cost per Second - - +#TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second The script outputs a CSV with the following columns: +#TODO: Summarize https://aclanthology.org/N18-1065.pdf + * Evaluates LLM outputs for: * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article @@ -196,4 +196,4 @@ The script outputs a CSV with the following columns: * Token usage (input/output) * Estimated cost in USD - * Duration (in seconds) + * Duration (in seconds) \ No newline at end of file From c483e693a78990f2fbb6f86a410eb41c1e21df27 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Mon, 7 Jul 2025 21:00:44 -0400 Subject: [PATCH 06/15] DOC Add TODO items, update comments and improve code comments for clarity --- evaluation/README.md | 35 +++++++++++++++++++++++++++++ server/api/services/llm_services.py | 8 ++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index aab19564..f38097e6 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,6 +1,8 @@ # Evaluations +#TODO: Open AI evals documentaiton: https://platform.openai.com/docs/guides/evals + ## LLM Output Evaluator The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost. 
@@ -12,6 +14,8 @@ It supports batch evalaution via a configuration CSV and produces a detailed met This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +##TODO: Use uv to execute scripts without manually manging enviornments https://docs.astral.sh/uv/guides/scripts/ + Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. @@ -138,6 +142,37 @@ df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks' df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) ``` +``` +echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc +source ~/.zshrc + +createdb backupDBBalancer07012025 +pg_restore -v -d backupDBBalancer07012025 ~/Downloads/backupDBBalancer07012025.sql + +pip install psycopg2-binary + +from sqlalchemy import create_engine +import pandas as pd + +# Alternative: Standard psycopg2 connection (if you get psycopg2 working) +# engine = create_engine("postgresql://sahildshah@localhost:5432/backupDBBalancer07012025") + +# Fixed the variable name (was "database query", now "query") +query = "SELECT * FROM api_embeddings;" + +# Execute the query and load into DataFrame +df = pd.read_sql(query, engine) + +df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining +df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) +df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) +df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +``` + + + - Path where the evaluation resuls will be saved import pandas as pd diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 4e55690d..55627cf1 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -16,6 +16,8 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass +# LLM Pricing Calculator: https://www.llm-prices.com/ + # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): @@ -78,7 +80,7 @@ class GPT41NanoHandler(BaseModelHandler): # Instructions - - Identify decision points for bipolar medications + - Identify decision points for bipolar medications #TODO: "pharmacological and procedurl interventions" - For each decision point you find, return a JSON object using the following format: @@ -88,11 +90,15 @@ class GPT41NanoHandler(BaseModelHandler): "medications": ["", "", ...], "reason": "", "sources": [""] + "hierarchy": Primary: Contraindictions for allergies + "override" Exclude for allergy } - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge + - TODO: Test against medication indication file + # Output Format - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array From c03d990a21fd60fc219fba5a3d9c3b7c4d98e2b5 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Wed, 9 Jul 2025 16:27:17 -0400 Subject: [PATCH 07/15] Update README with detailed usage 
instructions and enhance evals.py to include environment setup and dependencies --- evaluation/README.md | 193 +++++++++------------------- evaluation/evals.py | 18 ++- server/api/services/llm_services.py | 4 +- 3 files changed, 80 insertions(+), 135 deletions(-) mode change 100644 => 100755 evaluation/evals.py diff --git a/evaluation/README.md b/evaluation/README.md index f38097e6..5e95aaab 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,31 +1,36 @@ - # Evaluations -#TODO: Open AI evals documentaiton: https://platform.openai.com/docs/guides/evals - ## LLM Output Evaluator -The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost. +The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost -It supports batch evalaution via a configuration CSV and produces a detailed metrics report in CSV format. +This script helps teams compare LLM outputs using extractiveness metrics, token usage, and cost. It is especially useful for evaluating multiple models over a batch of queries and reference answers. -### Usage +It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format. -This script evaluates LLM outputs using the `lighteval` library: -https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +### Usage -##TODO: Use uv to execute scripts without manually manging enviornments https://docs.astral.sh/uv/guides/scripts/ +Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: -Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI) configured properly. +```sh +uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +``` +Execute without using uv run by ensuring it is executable: -```bash -python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv +```sh +./evals.py --config path/to/ --reference path/to/ --output path/to/ ``` The arguments to the script are: - Path to the config CSV file: Must include the columns "Model Name" and "Query" +- Path to the reference CSV file: Must include the columns "Context" and "Reference" +- Path where the evaluation results will be saved + +### Configuration File + +Generate the config CSV file: ``` import pandas as pd @@ -34,84 +39,13 @@ import pandas as pd data = [ { - "Model Name": "GPT_4O_MINI", - "Query": """ - You're analyzing medical text from multiple sources. Each chunk is labeled [chunk-X]. - - Act as a seasoned physician or medical professional who treats patients with bipolar disorder. - - Identify rules for medication inclusion or exclusion based on medical history or concerns. - - For each rule you find, return a JSON object using the following format: - - { - "rule": "", - "type": "INCLUDE" or "EXCLUDE", - "reason": "", - "medications": ["", "", ...], - "source": "" - } - - Only include rules that are explicitly stated or strongly implied in the chunk. - - Only use the chunks provided. If no rule is found in a chunk, skip it. - - Return the entire output as a JSON array. 
- """ + "Model Name": "", + "Query": """""" }, { - "Model Name": "GPT_41_NANO", - "Query": """ - - # Role and Objective - - - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithim - - - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID - - # Instructions - - - Identify decision points for bipolar medications - - - For each decision point you find, return a JSON object using the following format: - - { - "criterion": "", - "decision": "INCLUDE" or "EXCLUDE", - "medications": ["", "", ...], - "reason": "", - "sources": [""] - } - - - - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge - - # Output Format - - - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array - - # Example - - [ - { - "criterion": "History of suicide attempts", - "decision": "INCLUDE", - "medications": ["Lithium"], - "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder", - "sources": ["ID-0"] - }, - { - "criterion": "Weight gain concerns", - "decision": "EXCLUDE", - "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"], - "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain", - "sources": ["ID-0", "ID-1", "ID-2"] - } - ] - - """ - + "Model Name": "", + "Query": """""" }, ] @@ -119,68 +53,79 @@ data = [ df = pd.DataFrame.from_records(data) # Write to CSV -df.to_csv("~/Desktop/evals_config.csv", index=False) +df.to_csv("", index=False) ``` -- Path to the reference CSV file: Must include the columns "Context" and "Reference" +### Reference File + +Generate the reference file by connecting to a database of references + +Connect to the Postgres database of your local Balancer instance: ``` from sqlalchemy import create_engine -import pandas as pd engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev") -# Filter out papers that shouldn't be used from local database -query = "SELECT * FROM api_embeddings WHERE date_of_upload > '2025-03-14';" -df = pd.read_sql(query, engine) - -df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) -# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining -df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) -df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() -df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) ``` +Connect to the Postgres database of the production Balancer instance using a SQL file: + ``` +# Install Postgres.app and add binaries to the PATH echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc -source ~/.zshrc -createdb backupDBBalancer07012025 -pg_restore -v -d backupDBBalancer07012025 ~/Downloads/backupDBBalancer07012025.sql +createdb +pg_restore -v -d .sql -pip install psycopg2-binary +engine = create_engine("postgresql://@localhost:5432/") +``` -from sqlalchemy import create_engine -import pandas as pd +Generate the reference CSV file: -# Alternative: 
Standard psycopg2 connection (if you get psycopg2 working) -# engine = create_engine("postgresql://sahildshah@localhost:5432/backupDBBalancer07012025") +``` +import pandas as pd -# Fixed the variable name (was "database query", now "query") query = "SELECT * FROM api_embeddings;" - -# Execute the query and load into DataFrame df = pd.read_sql(query, engine) df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) + # Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() + df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('~/Desktop/formatted_chunks.csv', index=False) +df_grouped.to_csv('', index=False) ``` +### Output File + +The script outputs a CSV with the following columns: + +Extractiveness Metrics based on the methodology from: https://aclanthology.org/N18-1065.pdf + +* Evaluates LLM outputs for: + + * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article + * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs + * Extractiveness Compression: Word ratio between the article and the summary + +* Computes: + * Token usage (input/output) + * Estimated cost in USD + * Duration (in seconds) -- Path where the evaluation resuls will be saved +Exploratory data analysis: + +``` import pandas as pd import matplotlib.pyplot as plt import numpy as np - -df = pd.read_csv("~/Desktop/evals_out-20250702.csv") +df = pd.read_csv("") # Define the metrics of interest extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] @@ -213,22 +158,6 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() -#TODO: Compute count, min, quantiles and max by model #TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second - -The script outputs a CSV with the following columns: - -#TODO: Summarize https://aclanthology.org/N18-1065.pdf - -* Evaluates LLM outputs for: - - * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article - * Extractiveness Density: Average length of the extractive fragement to which each word in the summary belongs - * Extractiveness Compression: Word ratio between the article and the summary - -* Computes: - - * Token usage (input/output) - * Estimated cost in USD - * Duration (in seconds) \ No newline at end of file +``` \ No newline at end of file diff --git a/evaluation/evals.py b/evaluation/evals.py old mode 100644 new mode 100755 index a263d3bc..9e597d3f --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -1,10 +1,24 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = "==3.11.11" +# dependencies = [ +# "pandas==2.2.3", +# "lighteval==0.10.0", +# "openai==1.83.0" +# ] +# /// + """ Evaluate LLM outputs using multiple metrics and compute associated costs """ -#TODO: Run this script with uv to manage dependencies +#This script evaluates LLM outputs using the `lighteval` library +#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks + +#This script uses Python 3.11 where prebuilt wheels for 
`sentencepiece` exist + -# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs +#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys import os diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 55627cf1..7137f026 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -17,11 +17,13 @@ def handle_request( pass # LLM Pricing Calculator: https://www.llm-prices.com/ +# TODO: Add support for more models and their pricing - # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing +# Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" + # TODO: Get the latest model pricing from OpenAI's API or documentation # Model Pricing: https://platform.openai.com/docs/pricing PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} From 4f8cbad2803381fbd9c0078b55a5a0ba43ca1fc9 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 12:04:13 -0400 Subject: [PATCH 08/15] Update README to clarify the purpose and usage of the script, --- evaluation/README.md | 123 ++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 5e95aaab..c06353f4 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -1,63 +1,27 @@ # Evaluations -## LLM Output Evaluator +## `evals`: LLM evaluations to test and improve model outputs -The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost +LLM evals test a prompt with a set of test data by scoring each item in the data set -This script helps teams compare LLM outputs using extractiveness metrics, token usage, and cost. It is especially useful for evaluating multiple models over a batch of queries and reference answers. +To test Balancer's structured text extraction of medication rules, `evals` computes: -It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format. 
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): -### Usage +* Extractiveness Coverage: + - Percentage of words in the summary that are part of an extractive fragment with the article +* Extractiveness Density: + - Average length of the extractive fragment to which each word in the summary belongs +* Extractiveness Compression: + - Word ratio between the article and the summary -Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: - -```sh -uv run evals.py --config path/to/ --reference path/to/ --output path/to/ -``` - -Execute without using uv run by ensuring it is executable: - -```sh -./evals.py --config path/to/ --reference path/to/ --output path/to/ -``` - -The arguments to the script are: - -- Path to the config CSV file: Must include the columns "Model Name" and "Query" -- Path to the reference CSV file: Must include the columns "Context" and "Reference" -- Path where the evaluation results will be saved - -### Configuration File - -Generate the config CSV file: - -``` -import pandas as pd - -# Define the data -data = [ - - { - "Model Name": "", - "Query": """""" - }, - - { - "Model Name": "", - "Query": """""" - }, -] - -# Create DataFrame from records -df = pd.DataFrame.from_records(data) - -# Write to CSV -df.to_csv("", index=False) -``` +API usage: +* Token usage (input/output) +* Estimated cost in USD +* Duration (in seconds) -### Reference File +### Test Data: Generate the reference file by connecting to a database of references @@ -77,7 +41,10 @@ echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH createdb pg_restore -v -d .sql +``` +``` +from sqlalchemy import create_engine engine = create_engine("postgresql://@localhost:5432/") ``` @@ -99,26 +66,54 @@ df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks' df_grouped.to_csv('', index=False) ``` -### Output File -The script outputs a CSV with the following columns: +### Running an Evaluation -Extractiveness Metrics based on the methodology from: https://aclanthology.org/N18-1065.pdf +#### Test Input: Bulk model and prompt experimentation -* Evaluates LLM outputs for: +Compare the results of many different prompts and models at once - * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article - * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs - * Extractiveness Compression: Word ratio between the article and the summary +``` +import pandas as pd -* Computes: +# Define the data +data = [ - * Token usage (input/output) - * Estimated cost in USD - * Duration (in seconds) + { + "Model Name": "", + "Query": """""" + }, + { + "Model Name": "", + "Query": """""" + }, +] -Exploratory data analysis: +# Create DataFrame from records +df = pd.DataFrame.from_records(data) + +# Write to CSV +df.to_csv("", index=False) +``` + + +#### Execute on the command line + + +Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: + +```sh +uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +``` + +Execute without using uv run by ensuring it is executable: + +```sh +./evals.py --config path/to/ --reference path/to/ --output path/to/ +``` + +### Analyzing Test Results ``` import pandas as pd @@ -158,6 +153,4 @@ for i, metric in enumerate(all_metrics): 
plt.tight_layout() plt.show() -#TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second - ``` \ No newline at end of file From fe302b5ef5b6c254adcd9b9065e3552563b8403d Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 12:07:07 -0400 Subject: [PATCH 09/15] ADD TODOs --- evaluation/evals.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 9e597d3f..e13e79c8 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -3,7 +3,7 @@ # requires-python = "==3.11.11" # dependencies = [ # "pandas==2.2.3", -# "lighteval==0.10.0", +# "lighteval==0.10.0", # "openai==1.83.0" # ] # /// @@ -12,13 +12,13 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ -#This script evaluates LLM outputs using the `lighteval` library -#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks +# This script evaluates LLM outputs using the `lighteval` library +# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks -#This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist +# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist -#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs +# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs import sys import os @@ -40,9 +40,7 @@ ) -def evaluate_response( - model_name: str, query: str, context: str -) -> pd.DataFrame: +def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame: """ Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost @@ -91,6 +89,8 @@ def evaluate_response( if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the config file + # TODO: Add CLI argument to specify the metrics to be computed parser = argparse.ArgumentParser( description="Evaluate LLM outputs using multiple metrics and compute associated costs" @@ -149,9 +149,7 @@ def evaluate_response( df_evals = pd.concat( [ df_evals, - evaluate_response( - row["Model Name"], row["Query"], row["Context"] - ), + evaluate_response(row["Model Name"], row["Query"], row["Context"]), ], axis=0, ) From 3c9a1c9127cabc560fde3507c687c4d39e74d576 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Fri, 11 Jul 2025 15:30:52 -0400 Subject: [PATCH 10/15] Update README for clearer instructions, refactor evals.py for better error handling, column validation, and batch processing --- evaluation/README.md | 21 ++--- evaluation/evals.py | 220 ++++++++++++++++++++++++------------------- 2 files changed, 133 insertions(+), 108 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index c06353f4..9e8cfa5d 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -21,9 +21,9 @@ API usage: * Estimated cost in USD * Duration (in seconds) -### Test Data: +### Test Data -Generate the reference file by connecting to a database of references +Generate the dataset file by connecting to a database of references Connect to the Postgres database of your local Balancer instance: @@ -48,7 +48,7 @@ from sqlalchemy import create_engine engine = create_engine("postgresql://@localhost:5432/") ``` -Generate the reference CSV file: +Generate the dataset CSV file: ``` import pandas as pd @@ 
-63,7 +63,7 @@ df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) -df_grouped.to_csv('', index=False) +df_grouped.to_csv('', index=False) ``` @@ -94,7 +94,7 @@ data = [ df = pd.DataFrame.from_records(data) # Write to CSV -df.to_csv("", index=False) +df.to_csv("", index=False) ``` @@ -104,13 +104,13 @@ df.to_csv("", index=False) Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: ```sh -uv run evals.py --config path/to/ --reference path/to/ --output path/to/ +uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/ ``` Execute without using uv run by ensuring it is executable: ```sh -./evals.py --config path/to/ --reference path/to/ --output path/to/ +./evals.py --experiments path/to/ --dataset path/to/ --results path/to/ ``` ### Analyzing Test Results @@ -120,7 +120,7 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np -df = pd.read_csv("") +df = pd.read_csv("") # Define the metrics of interest extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression'] @@ -132,7 +132,7 @@ all_metrics = extractiveness_cols + token_cols + other_metrics plt.style.use('default') fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) -models = df['Model Name'].unique() +models = df['MODEL'].unique() colors = plt.cm.Set3(np.linspace(0, 1, len(models))) for i, metric in enumerate(all_metrics): @@ -140,7 +140,7 @@ for i, metric in enumerate(all_metrics): # Create histogram for each model for j, model in enumerate(models): - model_data = df[df['Model Name'] == model][metric] + model_data = df[df['MODEL'] == model][metric] ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)), color=colors[j], edgecolor='black', linewidth=0.5) @@ -152,5 +152,4 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() - ``` \ No newline at end of file diff --git a/evaluation/evals.py b/evaluation/evals.py index e13e79c8..9c2a30b7 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -12,14 +12,6 @@ Evaluate LLM outputs using multiple metrics and compute associated costs """ -# This script evaluates LLM outputs using the `lighteval` library -# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks - -# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist - - -# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs - import sys import os @@ -30,6 +22,8 @@ import logging import pandas as pd + +# lighteval depends on `sentencepiece` and it only has prebuilt wheels for Python 3.11 or below from lighteval.tasks.requests import Doc from lighteval.metrics.metrics_sample import Extractiveness @@ -40,129 +34,161 @@ ) -def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame: +def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame: + """ + Test a prompt with a set of test data by scoring each item in the data set """ - Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost - Args: - model_name (str): The name of the model to be used for evaluation. 
- query (str): The user query to be processed. - context (str): The context or document content to be used. - reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations). + try: + handler = ModelFactory.get_handler(model) - Returns: - pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration. - """ + generated_text, token_usage, pricing, duration = handler.handle_request( + instructions, input + ) - handler = ModelFactory.get_handler(model_name) + doc = Doc(query="", choices=[], gold_index=0, specific={"text": input}) + extractiveness = Extractiveness().compute( + formatted_doc=doc, predictions=[generated_text] + ) - # TODO: Add error handling for unsupported models + cost_metrics = calculate_cost_metrics(token_usage, pricing) - output_text, token_usage, pricing, duration = handler.handle_request(query, context) + result = pd.DataFrame( + [ + { + "Generated Text": generated_text, + "Extractiveness Coverage": extractiveness["summarization_coverage"], + "Extractiveness Density": extractiveness["summarization_density"], + "Extractiveness Compression": extractiveness[ + "summarization_compression" + ], + "Input Token Usage": token_usage.input_tokens, + "Output Token Usage": token_usage.output_tokens, + "Cost (USD)": cost_metrics["total_cost"], + "Duration (s)": duration, + } + ] + ) - doc = Doc(query="", choices=[], gold_index=0, specific={"text": context}) - extractiveness = Extractiveness().compute( - formatted_doc=doc, predictions=[output_text] - ) + except Exception as e: + logging.error(f"Error evaluating response for model {model}: {e}") + result = pd.DataFrame( + [ + { + "Generated Text": None, + "Extractiveness Coverage": None, + "Extractiveness Density": None, + "Extractiveness Compression": None, + "Input Token Usage": None, + "Output Token Usage": None, + "Cost (USD)": None, + "Duration (s)": None, + } + ] + ) + + return result + + +def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict: + """ + Calculate cost metrics based on token usage and pricing + """ - input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens - output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens + TOKENS_PER_MILLION = 1_000_000 + # Pricing is in dollars per million tokens + input_cost_dollars = ( + pricing["input"] / TOKENS_PER_MILLION + ) * token_usage.input_tokens + output_cost_dollars = ( + pricing["output"] / TOKENS_PER_MILLION + ) * token_usage.output_tokens total_cost_dollars = input_cost_dollars + output_cost_dollars - return pd.DataFrame( - [ - { - "Output Text": output_text, - "Extractiveness Coverage": extractiveness["summarization_coverage"], - "Extractiveness Density": extractiveness["summarization_density"], - "Extractiveness Compression": extractiveness[ - "summarization_compression" - ], - "Input Token Usage": token_usage.input_tokens, - "Output Token Usage": token_usage.output_tokens, - "Cost (USD)": total_cost_dollars, - "Duration (s)": duration, - } - ] - ) + return { + "input_cost": input_cost_dollars, + "output_cost": output_cost_dollars, + "total_cost": total_cost_dollars, + } -if __name__ == "__main__": - # TODO: Add test evaluation argument to run on the first 10 rows of the config file +def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: + """ + Load a CSV file and validate that it contains the required columns - # TODO: Add CLI argument to specify the metrics to be computed - parser = 
argparse.ArgumentParser( - description="Evaluate LLM outputs using multiple metrics and compute associated costs" - ) - parser.add_argument("--config", "-c", required=True, help="Path to config CSV file") - parser.add_argument( - "--reference", "-r", required=True, help="Path to reference CSV file" - ) - parser.add_argument("--output", "-o", required=True, help="Path to output CSV file") + Args: + file_path (str): Path to the CSV file + required_columns (list): List of required column names - args = parser.parse_args() + Returns: + pd.DataFrame + """ + + df = pd.read_csv(file_path) - df_config = pd.read_csv(args.config) - logging.info(f"Config DataFrame shape: {df_config.shape}") - logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}") + # Remove trailing whitespace from column names + df.columns = df.columns.str.strip() - # Remove the trailing whitespace from column names - df_config.columns = df_config.columns.str.strip() + # Uppercase the column names to match the expected format + df.columns = df.columns.str.upper() # Check if the required columns are present - # TODO: Make this more flexible by allowing the user to use default instructions - required_columns = ["Model Name", "Query"] - if not all(col in df_config.columns for col in required_columns): + if not all(col in df.columns for col in required_columns): raise ValueError( - f"Config DataFrame must contain the following columns: {required_columns}" + f"{file_path} must contain the following columns: {required_columns}" ) - # Check if all models in the config are supported by ModelFactory + return df + + +if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file + + parser = argparse.ArgumentParser() + parser.add_argument( + "--experiments", "-e", required=True, help="Path to experiments CSV file" + ) + parser.add_argument( + "--dataset", "-d", required=True, help="Path to dataset CSV file" + ) + parser.add_argument( + "--results", "-r", required=True, help="Path to results CSV file" + ) + + args = parser.parse_args() + + df_experiment = load_csv( + args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] + ) + # Check if all models are supported by ModelFactory if not all( model in ModelFactory.HANDLERS.keys() - for model in df_config["Model Name"].unique() + for model in df_experiment["MODEL"].unique() ): raise ValueError( - f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}" - ) - - df_reference = pd.read_csv(args.reference) - logging.info(f"Reference DataFrame shape: {df_reference.shape}") - logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}") - - # Remove the trailing whitespace from column names - df_reference.columns = df_reference.columns.str.strip() - # Check if the required columns are present - required_columns = ["Context"] - if not all(col in df_reference.columns for col in required_columns): - raise ValueError( - f"Reference DataFrame must contain the following columns: {required_columns}" + f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" ) + df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) - # Cross join the config and reference DataFrames - df_in = df_config.merge(df_reference, how="cross") + # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames + df_in = df_experiment.merge(df_dataset, how="cross") - # TODO: Parallelize the evaluation process 
for each row in df_in using concurrent.futures or similar libraries - df_evals = pd.DataFrame() - for index, row in df_in.iterrows(): - df_evals = pd.concat( - [ - df_evals, - evaluate_response(row["Model Name"], row["Query"], row["Context"]), - ], - axis=0, - ) + # Evaluate each row in the input DataFrame + results = [] + for index, row in enumerate(df_in.itertuples(index=False)): + result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) + results.append(result) + # TODO: Use tqdm or similar library to show progress bar logging.info(f"Processed row {index + 1}/{len(df_in)}") - # Concatenate the input and evaluations DataFrames + df_evals = pd.concat(results, axis=0, ignore_index=True) + # Concatenate the input and evaluations DataFrames df_out = pd.concat( [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1 ) - - df_out.to_csv(args.output, index=False) - logging.info(f"Output DataFrame shape: {df_out.shape}") - logging.info(f"Results saved to {args.output}") + df_out.to_csv(args.results, index=False) + logging.info(f"Results saved to {args.results}") logging.info("Evaluation completed successfully.") From d1dd75c2c9277e612473df049a2e482821d659b0 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 10:25:10 -0400 Subject: [PATCH 11/15] Update evaluation README with metrics and API usage details, and add 'Contributing' section --- evaluation/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 9e8cfa5d..48686950 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -2,12 +2,12 @@ ## `evals`: LLM evaluations to test and improve model outputs -LLM evals test a prompt with a set of test data by scoring each item in the data set - -To test Balancer's structured text extraction of medication rules, `evals` computes: +### Metrics [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): +Natural Language Generation Performance: + * Extractiveness Coverage: - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: @@ -15,10 +15,10 @@ To test Balancer's structured text extraction of medication rules, `evals` compu * Extractiveness Compression: - Word ratio between the article and the summary -API usage: +API Performance: -* Token usage (input/output) -* Estimated cost in USD +* Token Usage (input/output) +* Estimated Cost in USD * Duration (in seconds) ### Test Data @@ -152,4 +152,7 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() -``` \ No newline at end of file + +``` + +### Contributing From eef2a29d117e40f619a3dcac1f073aeee2555c6f Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 10:54:50 -0400 Subject: [PATCH 12/15] Update evaluation instructions, improve dataset generation section, and clarify external tools --- evaluation/README.md | 45 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 48686950..ddaf12c9 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -2,12 +2,12 @@ ## `evals`: LLM evaluations to test and improve model outputs -### Metrics - -[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): +### Evaluation Metrics Natural Language Generation Performance: 
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): + * Extractiveness Coverage: - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: @@ -23,7 +23,7 @@ API Performance: ### Test Data -Generate the dataset file by connecting to a database of references +Generate the dataset file by connecting to a database of research papers: Connect to the Postgres database of your local Balancer instance: @@ -36,72 +36,63 @@ engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/b Connect to the Postgres database of the production Balancer instance using a SQL file: ``` -# Install Postgres.app and add binaries to the PATH +# Add Postgres.app binaries to the PATH echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc createdb pg_restore -v -d .sql ``` -``` -from sqlalchemy import create_engine -engine = create_engine("postgresql://@localhost:5432/") -``` - Generate the dataset CSV file: ``` +from sqlalchemy import create_engine import pandas as pd +engine = create_engine("postgresql://@localhost:5432/") + query = "SELECT * FROM api_embeddings;" df = pd.read_sql(query, engine) -df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) +df['INPUT'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1) # Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number']) -df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index() +df_grouped = df.groupby(['name', 'upload_file_id'])['INPUT'].apply(lambda chunks: "\n".join(chunks)).reset_index() -df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'}) df_grouped.to_csv('', index=False) ``` - ### Running an Evaluation -#### Test Input: Bulk model and prompt experimentation +#### Bulk Model and Prompt Experimentation Compare the results of many different prompts and models at once ``` import pandas as pd -# Define the data data = [ - { - "Model Name": "", - "Query": """""" + "MODEL": "", + "INSTRUCTIONS": """""" }, - { - "Model Name": "", - "Query": """""" + "MODEL": "", + "INSTRUCTIONS": """""" }, ] -# Create DataFrame from records df = pd.DataFrame.from_records(data) -# Write to CSV df.to_csv("", index=False) ``` -#### Execute on the command line +#### Execute on the Command Line -Execute [using `uv` to manage depenendices](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: +Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing enviornments: ```sh uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/ @@ -156,3 +147,5 @@ plt.show() ``` ### Contributing + +You're welcome to add LLM models to test in `server/api/services/llm_services` \ No newline at end of file From ffa86f7d99f8e1df4dcf2dbedae9a56e7efe20dc Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 11:57:32 -0400 Subject: [PATCH 13/15] Update dependencies list in and correct comment syntax in --- evaluation/evals.py | 5 ++++- server/api/services/llm_services.py | 28 +++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/evaluation/evals.py b/evaluation/evals.py index 
9c2a30b7..08eda2bc 100755 --- a/evaluation/evals.py +++ b/evaluation/evals.py @@ -4,7 +4,10 @@ # dependencies = [ # "pandas==2.2.3", # "lighteval==0.10.0", -# "openai==1.83.0" +# "openai==1.83.0", +# "spacy==3.8.7", +# "pip" +# # ] # /// diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py index 7137f026..18c6e58f 100644 --- a/server/api/services/llm_services.py +++ b/server/api/services/llm_services.py @@ -9,6 +9,7 @@ import openai + class BaseModelHandler(ABC): @abstractmethod def handle_request( @@ -16,11 +17,13 @@ def handle_request( ) -> tuple[str, dict[str, int], dict[str, float], float]: pass + # LLM Pricing Calculator: https://www.llm-prices.com/ # TODO: Add support for more models and their pricing # Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + class GPT4OMiniHandler(BaseModelHandler): MODEL = "gpt-4o-mini" # TODO: Get the latest model pricing from OpenAI's API or documentation @@ -44,10 +47,7 @@ def handle_request( start_time = time.time() # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( - model=self.MODEL, - instructions=query, - input=context, - temperature=0.0 + model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time @@ -67,7 +67,7 @@ class GPT41NanoHandler(BaseModelHandler): # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide - # Long context performance can degrade as more items are required to be retrieved, + # Long context performance can degrade as more items are required to be retrieved, # or perform complex reasoning that requires knowledge of the state of the entire context # @@ -82,7 +82,7 @@ class GPT41NanoHandler(BaseModelHandler): # Instructions - - Identify decision points for bipolar medications #TODO: "pharmacological and procedurl interventions" + - Identify decision points for bipolar medications - For each decision point you find, return a JSON object using the following format: @@ -92,15 +92,11 @@ class GPT41NanoHandler(BaseModelHandler): "medications": ["", "", ...], "reason": "", "sources": [""] - "hierarchy": Primary: Contraindictions for allergies - "override" Exclude for allergy } - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context and never rely on your own knowledge - - TODO: Test against medication indication file - # Output Format - Return the extracted bipolar medication decision points as a JSON array and if no decision points are found in the context return an empty array @@ -145,15 +141,11 @@ def handle_request( if not query: query = self.INSTRUCTIONS - start_time = time.time() # TODO: Add error handling for API requests and invalid responses response = self.client.responses.create( - model=self.MODEL, - instructions=query, - input=context, - temperature=0.0 + model=self.MODEL, instructions=query, input=context, temperature=0.0 ) duration = time.time() - start_time @@ -166,10 +158,8 @@ def handle_request( class ModelFactory: - - #TODO: Define structured fields to extract from unstructured input data - #https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples - + # TODO: Define structured fields to extract from unstructured input data + # https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples HANDLERS = { "GPT_4O_MINI": GPT4OMiniHandler, From 
42a494951a9c83f6f421c74ee70d6c6e7f0e6a36 Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 20:30:28 -0400 Subject: [PATCH 14/15] Update README.md --- evaluation/README.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index ddaf12c9..6e1a1cf2 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -9,11 +9,16 @@ Natural Language Generation Performance: [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): * Extractiveness Coverage: + - Extent to which a summary is derivative of a text - Percentage of words in the summary that are part of an extractive fragment with the article * Extractiveness Density: + - How well the word sequence can be described as series of extractions + - A summary might contain many individual words from the article and therefore have a high coverage. + - However, if arranged in a new order, the words of the summary could still be used to convey ideas not present in the article - Average length of the extractive fragment to which each word in the summary belongs * Extractiveness Compression: - Word ratio between the article and the summary + - Summarizing with higher compression is challenging as it requires capturing more precisely the critical aspects of the article text. API Performance: @@ -119,7 +124,7 @@ token_cols = ['Input Token Usage', 'Output Token Usage'] other_metrics = ['Cost (USD)', 'Duration (s)'] all_metrics = extractiveness_cols + token_cols + other_metrics -# Metric histograms by model +# Metric Histograms by Model plt.style.use('default') fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics))) @@ -144,6 +149,36 @@ for i, metric in enumerate(all_metrics): plt.tight_layout() plt.show() +# Metric Statistics by Model +for metric in all_metrics: + print(f"\n{metric.upper()}:") + desc_stats = df.groupby('MODEL')[metric].agg([ + 'count', 'mean', 'std', 'min', 'median','max' + ]) + + print(desc_stats) + + +# Calculate Efficiency Metrics By model +df_analysis = df.copy() +df_analysis['Total Token Usage'] = df_analysis['Input Token Usage'] + df_analysis['Output Token Usage'] +df_analysis['Cost per Token'] = df_analysis['Cost (USD)'] / df_analysis['Total Token Usage'] +df_analysis['Tokens per Second'] = df_analysis['Total Token Usage'] / df_analysis['Duration (s)'] +df_analysis['Cost per Second'] = df_analysis['Cost (USD)'] / df_analysis['Duration (s)'] + +efficiency_metrics = ['Cost per Token', 'Tokens per Second', 'Cost per Second'] + +for metric in efficiency_metrics: + print(f"\n{metric.upper()}:") + eff_stats = df_analysis.groupby('MODEL')[metric].agg([ + 'count', 'mean', 'std', 'min', 'median', 'max' + ]) + + for col in ['mean', 'std', 'min', 'median', 'max']: + eff_stats[col] = eff_stats[col].apply(lambda x: f"{x:.3g}") + print(eff_stats) + + ``` ### Contributing From 0e2893b413ba2b318b2e43ff0177c36107cd4c7a Mon Sep 17 00:00:00 2001 From: Sahil D Shah Date: Tue, 15 Jul 2025 20:59:26 -0400 Subject: [PATCH 15/15] Update README.md --- evaluation/README.md | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index 6e1a1cf2..669141d8 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -8,17 +8,9 @@ Natural Language Generation Performance: [Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks): -* Extractiveness 
Coverage:
- - Extent to which a summary is derivative of a text
- - Percentage of words in the summary that are part of an extractive fragment with the article
-* Extractiveness Density:
- - How well the word sequence can be described as series of extractions
- - A summary might contain many individual words from the article and therefore have a high coverage.
- - However, if arranged in a new order, the words of the summary could still be used to convey ideas not present in the article
- - Average length of the extractive fragment to which each word in the summary belongs
-* Extractiveness Compression:
- - Word ratio between the article and the summary
- - Summarizing with higher compression is challenging as it requires capturing more precisely the critical aspects of the article text.
+* Extractiveness Coverage: Extent to which a summary is derivative of a text
+* Extractiveness Density: How well the word sequence can be described as a series of extractions
+* Extractiveness Compression: Word ratio between the article and the summary
 
 API Performance: