# Evaluations

## LLM Output Evaluator

The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.

This script helps teams compare LLM outputs using extractiveness metrics, token usage, and cost. It is especially useful for evaluating multiple models over a batch of queries and reference answers.

It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format.

### Usage

Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing environments:

```sh
uv run evals.py --config path/to/<CONFIG_CSV> --reference path/to/<REFERENCE_CSV> --output path/to/<OUTPUT_CSV>
```

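`uv run` can resolve the script's dependencies automatically when `evals.py` declares them as inline script metadata at the top of the file. The Python version and packages below are assumptions for illustration; `uv add --script evals.py <package>` adds entries to this block:

```
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "pandas",
#     "openai",
# ]
# ///
```
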
Execute without `uv run` by ensuring the script is executable:

```sh
./evals.py --config path/to/<CONFIG_CSV> --reference path/to/<REFERENCE_CSV> --output path/to/<OUTPUT_CSV>
```

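Making the script executable is a one-time step. For `./evals.py` to also resolve its own environment, the first line of `evals.py` would need a `uv` shebang; the shebang shown here is an assumption about how the script is set up:

```sh
# Assumed first line of evals.py: #!/usr/bin/env -S uv run --script
chmod +x evals.py
```
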
The arguments to the script are:

- Path to the config CSV file: Must include the columns "Model Name" and "Query"
- Path to the reference CSV file: Must include the columns "Context" and "Reference"
- Path where the evaluation results will be saved

### Configuration File

Generate the config CSV file:

```
import pandas as pd

data = [

    {
        "Model Name": "<MODEL_NAME_1>",
        "Query": """<YOUR_QUERY_1>"""
    },

    {
        "Model Name": "<MODEL_NAME_2>",
        "Query": """<YOUR_QUERY_2>"""
    },
]

# Create DataFrame from records
df = pd.DataFrame.from_records(data)

# Write to CSV
df.to_csv("<CONFIG_CSV_PATH>", index=False)
```

### Reference File

Generate the reference file by connecting to a database of references.

Connect to the Postgres database of your local Balancer instance:

```
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev")
```

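Optionally, a quick sanity check that the connection works and the embeddings table is reachable (an illustrative query, not part of the script):

```
import pandas as pd

print(pd.read_sql("SELECT COUNT(*) FROM api_embeddings;", engine))
```
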
Connect to a copy of the production Balancer database by restoring a SQL backup file locally:

```sh
# Install Postgres.app and add its binaries to the PATH
echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc

createdb <DB_NAME>
pg_restore -v -d <DB_NAME> <PATH_TO_BACKUP>.sql
```

Then point the engine at the restored database:

```
engine = create_engine("postgresql://<USER>@localhost:5432/<DB_NAME>")
```

Generate the reference CSV file:

```
import pandas as pd

query = "SELECT * FROM api_embeddings;"
df = pd.read_sql(query, engine)

df['formatted_chunk'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1)

# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining
df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
df_grouped = df.groupby(['name', 'upload_file_id'])['formatted_chunk'].apply(lambda chunks: "\n".join(chunks)).reset_index()

df_grouped = df_grouped.rename(columns={'formatted_chunk': 'concatenated_chunks'})
df_grouped.to_csv('<REFERENCE_CSV_PATH>', index=False)
```

### Output File

The script outputs a CSV with the following columns:

Extractiveness metrics, based on the methodology from https://aclanthology.org/N18-1065.pdf (a minimal sketch of these definitions follows the list):

* Evaluates LLM outputs for:

  * Extractiveness Coverage: Percentage of words in the summary that are part of an extractive fragment with the article
  * Extractiveness Density: Average length of the extractive fragment to which each word in the summary belongs
  * Extractiveness Compression: Word ratio between the article and the summary

* Computes:

  * Token usage (input/output)
  * Estimated cost in USD
  * Duration (in seconds)

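A minimal sketch of how the three extractiveness metrics are defined in the cited paper (Grusky et al., 2018), using simple whitespace tokenization. This illustrates the definitions under those assumptions; it is not necessarily how `evals.py` computes them:

```
def extractive_fragments(article_tokens, summary_tokens):
    """Greedily match shared fragments between article and summary (Grusky et al., 2018)."""
    fragments = []
    i = 0
    while i < len(summary_tokens):
        best = []  # longest shared fragment starting at summary position i
        j = 0
        while j < len(article_tokens):
            if summary_tokens[i] == article_tokens[j]:
                i_end, j_end = i, j
                while (i_end < len(summary_tokens) and j_end < len(article_tokens)
                       and summary_tokens[i_end] == article_tokens[j_end]):
                    i_end += 1
                    j_end += 1
                if i_end - i > len(best):
                    best = summary_tokens[i:i_end]
                j = j_end
            else:
                j += 1
        if best:
            fragments.append(best)
        i += max(len(best), 1)
    return fragments


def extractiveness(article, summary):
    """Coverage, density, and compression for a (Context, model output) pair."""
    a, s = article.split(), summary.split()
    frags = extractive_fragments(a, s)
    coverage = sum(len(f) for f in frags) / len(s)
    density = sum(len(f) ** 2 for f in frags) / len(s)
    compression = len(a) / len(s)
    return coverage, density, compression
```

Coverage near 1.0 means the output mostly copies wording from the context, higher density means longer copied spans, and compression is simply the context-to-output length ratio.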

Exploratory data analysis:

```
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("<OUTPUT_CSV_PATH>")

# Define the metrics of interest
extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']

# ... (omitted: definition of all_metrics and a per-metric plotting loop "for i, metric in enumerate(all_metrics):") ...

plt.tight_layout()
plt.show()

#TODO: Calculate efficiency metrics: Total Token Usage, Cost per Token, Tokens per Second, Cost per Second

```
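
A possible starting point for the efficiency-metrics TODO above, assuming the output CSV carries a "Model Name" column plus token, cost, and duration columns with the names shown below (all column names are assumptions; adjust them to the actual report):

```
import pandas as pd

df = pd.read_csv("<OUTPUT_CSV_PATH>")

# Column names below are assumptions; rename to match the actual report
df["Total Tokens"] = df["Input Tokens"] + df["Output Tokens"]
df["Cost per Token"] = df["Cost (USD)"] / df["Total Tokens"]
df["Tokens per Second"] = df["Total Tokens"] / df["Duration (s)"]
df["Cost per Second"] = df["Cost (USD)"] / df["Duration (s)"]

print(df.groupby("Model Name")[["Total Tokens", "Cost per Token", "Tokens per Second", "Cost per Second"]].mean())
```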