 # Standard
+from pathlib import Path
 from typing import List, TypedDict

 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd

 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass

     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.

         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
@@ -47,6 +51,19 @@ def run(
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
+        if not dataset:
+            raise ValueError(
+                "no dataset was provided, please specify the `dataset` argument"
+            )
+        if isinstance(dataset, Path):
+            input_ds = EvaluationDataset.from_pandas(
+                pd.read_json(dataset, lines=True, orient="records")
+            )
+        elif isinstance(dataset, list):
+            input_ds = EvaluationDataset.from_list(dataset)
+        else:
+            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -57,17 +74,15 @@ def run(
                 timeout=3600,
             )

-        # we will be using gpt-4o for the foreseeable future, we hardcode this
-        # for consistency of answers
-        input_ds = EvaluationDataset.from_list(dataset)
-
         # default set of metrics
         metrics = [
             RubricsScore(
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
             )
         ]

+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
             dataset=input_ds,
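
For reference, a minimal usage sketch of the changed `run` signature. The class name `RagasEvaluator` and the sample field names (`user_input`, `response`, `reference`) are assumptions not shown in this diff; the field names follow what Ragas' `EvaluationDataset` expects for reference-based rubric scoring.

# sketch only, not part of the commit; the judge model is gpt-4o, so OPENAI_API_KEY must be set
from pathlib import Path

evaluator = RagasEvaluator()  # assumed name of the evaluator class defined in this module

# Option 1: pass an in-memory list of samples
samples = [
    {
        "user_input": "What is the capital of France?",   # question sent to the model
        "response": "The capital of France is Paris.",    # model answer being graded
        "reference": "Paris is the capital of France.",   # gold answer used by the rubric
    }
]
results = evaluator.run(dataset=samples)

# Option 2: pass a Path to a JSONL file; the new branch loads it with
# pd.read_json(path, lines=True, orient="records") before building the EvaluationDataset
results = evaluator.run(dataset=Path("samples.jsonl"))

print(results.to_pandas())  # the EvaluationResult can be inspected as a DataFrame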