
Commit fa7103c

Merge pull request #290 from sahilds1/262-extract-meds-rules
[#262] Add med rule extraction tests and model evaluations
2 parents b1fb75c + 6a6eb7a commit fa7103c

File tree: 5 files changed, +518 −24 lines
evaluation/README.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Evaluations

## LLM Output Evaluator

The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.

It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format.

### Usage

This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks

Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly.

```bash
python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
```

The arguments to the script are:

- Path to the config CSV file: must include the columns "Model Name" and "Query"
- Path to the reference CSV file: must include the columns "Context" and "Reference"
- Path where the evaluation results will be saved

For each model/query and context/reference pair, the output CSV reports:

* Extractiveness metrics:
  * Extractiveness Coverage
  * Extractiveness Density
  * Extractiveness Compression
* Usage and cost:
  * Token usage (input/output)
  * Estimated cost in USD
  * Duration (in seconds)
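For illustration, a minimal pair of input CSVs could be generated with the sketch below. The model name, query, and reference text are placeholders (any model name used must be a key registered in `ModelFactory.HANDLERS`):

```python
# Sketch only: writes a one-row config.csv and reference.csv with placeholder values.
import pandas as pd

pd.DataFrame(
    [{"Model Name": "example-model", "Query": "Extract the medication rules from the context."}]
).to_csv("config.csv", index=False)

pd.DataFrame(
    [{"Context": "Full source document text goes here.", "Reference": "Gold-standard extraction goes here."}]
).to_csv("reference.csv", index=False)
```

Running the `evals.py` command above against these two files would yield a single-row results CSV, since there is one model/query row and one context/reference row.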

evaluation/evals.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
"""
Evaluate LLM outputs using multiple metrics and compute associated costs
"""

# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs

import sys
import os

# Ensure the parent directory is on the path so ModelFactory can be imported
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import argparse
import logging

import pandas as pd
from lighteval.tasks.requests import Doc
from lighteval.metrics.metrics_sample import Extractiveness

from server.api.services.llm_services import ModelFactory

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def evaluate_response(
    model_name: str, query: str, context: str, reference: str
) -> pd.DataFrame:
    """
    Evaluate a model's response to a given query and context, computing extractiveness metrics, token usage, and cost.

    Args:
        model_name (str): The name of the model to be used for evaluation.
        query (str): The user query to be processed.
        context (str): The context or document content to be used.
        reference (str): The reference text for comparison (not used in this function, but available for further evaluations).

    Returns:
        pd.DataFrame: A single-row DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
    """

    handler = ModelFactory.get_handler(model_name)

    # TODO: Add error handling for unsupported models

    output_text, token_usage, pricing, duration = handler.handle_request(query, context)

    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
    extractiveness = Extractiveness().compute(
        formatted_doc=doc, predictions=[output_text]
    )

    # Pricing is expressed in USD per one million tokens
    input_cost_dollars = (pricing["input"] / 1_000_000) * token_usage.input_tokens
    output_cost_dollars = (pricing["output"] / 1_000_000) * token_usage.output_tokens

    total_cost_dollars = input_cost_dollars + output_cost_dollars

    return pd.DataFrame(
        [
            {
                "Output Text": output_text,
                "Extractiveness Coverage": extractiveness["summarization_coverage"],
                "Extractiveness Density": extractiveness["summarization_density"],
                "Extractiveness Compression": extractiveness[
                    "summarization_compression"
                ],
                "Input Token Usage": token_usage.input_tokens,
                "Output Token Usage": token_usage.output_tokens,
                "Cost (USD)": total_cost_dollars,
                "Duration (s)": duration,
            }
        ]
    )


if __name__ == "__main__":
    # TODO: Add CLI argument to specify the metrics to be computed
    parser = argparse.ArgumentParser(
        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
    )
    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
    parser.add_argument(
        "--reference", "-r", required=True, help="Path to reference CSV file"
    )
    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")

    args = parser.parse_args()

    df_config = pd.read_csv(args.config)
    logging.info(f"Config DataFrame shape: {df_config.shape}")
    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")

    # Strip leading/trailing whitespace from column names
    df_config.columns = df_config.columns.str.strip()

    # Check that the required columns are present
    required_columns = ["Model Name", "Query"]
    if not all(col in df_config.columns for col in required_columns):
        raise ValueError(
            f"Config DataFrame must contain the following columns: {required_columns}"
        )

    # Check that all models in the config are supported by ModelFactory
    if not all(
        model in ModelFactory.HANDLERS.keys()
        for model in df_config["Model Name"].unique()
    ):
        raise ValueError(
            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}"
        )

    df_reference = pd.read_csv(args.reference)
    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")

    # Strip leading/trailing whitespace from column names
    df_reference.columns = df_reference.columns.str.strip()

    # Check that the required columns are present
    required_columns = ["Context", "Reference"]
    if not all(col in df_reference.columns for col in required_columns):
        raise ValueError(
            f"Reference DataFrame must contain the following columns: {required_columns}"
        )

    # Cross join the config and reference DataFrames: every (model, query) pair
    # is evaluated against every (context, reference) pair
    df_in = df_config.merge(df_reference, how="cross")

    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
    df_evals = pd.DataFrame()
    for index, row in df_in.iterrows():
        df_evals = pd.concat(
            [
                df_evals,
                evaluate_response(
                    row["Model Name"], row["Query"], row["Context"], row["Reference"]
                ),
            ],
            axis=0,
        )

        logging.info(f"Processed row {index + 1}/{len(df_in)}")

    # Concatenate the input and evaluation DataFrames column-wise
    df_out = pd.concat(
        [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
    )

    df_out.to_csv(args.output, index=False)
    logging.info(f"Output DataFrame shape: {df_out.shape}")
    logging.info(f"Results saved to {args.output}")
    logging.info("Evaluation completed successfully.")

evaluation/test_evals.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
from unittest.mock import patch, MagicMock

import pytest
import pandas as pd

from evals import evaluate_response


class MockTokenUsage:
    def __init__(self, input_tokens, output_tokens):
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens


@patch("evals.ModelFactory.get_handler")
@patch("evals.Extractiveness.compute")
def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):

    # Mock BaseModelHandler
    mock_handler = MagicMock()
    mock_handler.handle_request.return_value = (
        "This is a summary.",
        MockTokenUsage(input_tokens=100, output_tokens=50),
        {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
        1.23,  # duration
    )

    mock_get_handler.return_value = mock_handler

    mock_extractiveness_compute.return_value = {
        "summarization_coverage": 0.8,
        "summarization_density": 1.5,
        "summarization_compression": 2.0,
    }

    df = evaluate_response(
        model_name="mock-model",
        query="What is the summary?",
        context="This is a long article about something important.",
        reference="This is a reference summary.",
    )

    assert isinstance(df, pd.DataFrame)
    assert df.shape == (1, 8)
    assert df["Output Text"].iloc[0] == "This is a summary."
    assert df["Extractiveness Coverage"].iloc[0] == 0.8
    assert df["Extractiveness Density"].iloc[0] == 1.5
    assert df["Extractiveness Compression"].iloc[0] == 2.0
    assert df["Input Token Usage"].iloc[0] == 100
    assert df["Output Token Usage"].iloc[0] == 50

    expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50
    assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost
    assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23
