feat: add ability for answers to be generated from user questions

RobotSail · RobotSail · commit f170a64ed33c · 2024-12-06T22:20:58.000-05:00
When a dataset is provided and is missing the `response` field, we will need to generate these responses. This commit ensures that when this case happens, we will error out when a student model is not configured. Otherwise, we will always generate these responses if the student model exists, regardless if `response` is in the dataframe or not.

Signed-off-by: Oleg S &lt;97077423+RobotSail@users.noreply.github.com&gt;
diff --git a/src/instructlab/eval/ragas.py b/src/instructlab/eval/ragas.py
@@ -1,6 +1,7 @@
 # Standard
 from pathlib import Path
-from typing import List, TypedDict
+from pydantic import BaseModel, ConfigDict
+from typing import List, TypedDict, Optional
 
 # Third Party
 from langchain_community.chat_models import ChatOpenAI
@@ -9,23 +10,57 @@
     DEFAULT_WITH_REFERENCE_RUBRICS,
     RubricsScore,
 )
-import pandas as pd
+from pandas import DataFrame, read_json
 
 # Local
 from .evaluator import Evaluator
+from .mt_bench_common import get_openai_client
 
 
 class Sample(TypedDict):
+    """
+    TypedDict of a sample that we accept when doing eval with Ragas.
+    We specifically use TypedDict here to be flexible with the input data we accept.
+    """
+
     # question
     user_input: str
 
     # model answer
-    response: str
+    response: Optional[str]
 
     # golden answer
     reference: str
 
 
+# default system prompt we'll use when none is provided. Make it private as we don't intend this to be a public object
+_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
+Your primary goal is to answer queries with the most up-to-date and factual information available.
+Focus on delivering clear, concise, and correct responses.
+If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
+Your responses should prioritize accuracy over all other considerations."""
+
+
+class ModelConfig(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    # URL of the OpenAI server where the model shall be hosted
+    base_url: str
+
+    # name of the model to use
+    model_name: str
+    system_prompt: str = _DEFAULT_SYSTEM_PROMPT
+
+    # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client)
+    # To provide an OpenAI key, you must set it here; else the default is used.
+    api_key: str = "no-api-key"
+
+    # "model randomness" aka likelihood of sampling something other than the likeliest token
+    temperature: float = 0.0
+
+    max_tokens: int = 768
+
+
 class RagasEvaluator(Evaluator):
     # most basic implementation, we just assume that the user will bring the existing model responses
     name = "ragas"
@@ -34,14 +69,24 @@ def __init__(self):
         pass
 
     def run(
-        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
+        self,
+        dataset: List[Sample] | Path,
+        student_model: ModelConfig | None = None,
+        run_config: RunConfig = None,
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.
 
+        When the `dataset` lacks the `response` field, then `student_model` must be provided
+        in order to generate the answers.
+
         Args:
             dataset (List[Sample] | Path):
-                List of model questions and answers
+                Can be either a list of `Sample` objects or a path to a jsonl file containing
+                records matching `Sample`.
+            student_model: (StudentModelConfig):
+                When this parameter is provided, we'll attempt to use the described model in order to
+                generate the responses from the given list of questions.
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
                 a default one is created containing extremely permissive settings when handling
@@ -55,14 +100,29 @@ def run(
             raise ValueError(
                 "no dataset was provided, please specify the `dataset` argument"
             )
-        if isinstance(dataset, Path):
-            input_ds = EvaluationDataset.from_pandas(
-                pd.read_json(dataset, lines=True, orient="records")
+
+        if type(dataset) not in (list, Path):
+            raise TypeError(f"invalid type of dataset: {type(dataset)}")
+
+        # ensure we are in the dataframe format
+        input_df = None
+        if isinstance(dataset, list):
+            input_df = DataFrame(dataset)
+        elif isinstance(dataset, Path):
+            input_df = read_json(dataset, orient="records", lines=True)
+
+        # this should never happen, but pylint is not smart enough to detect it
+        assert input_df is not None
+
+        need_to_generate_questions = "response" not in input_df.columns
+        if need_to_generate_questions and not student_model:
+            raise ValueError(
+                "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
             )
-        elif isinstance(dataset, list):
-            input_ds = EvaluationDataset.from_list(dataset)
-        else:
-            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
+        # if the student model was provided then we always generate regardless
+        if student_model:
+            input_df = self._generate_answers_from_model(input_df, student_model)
 
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -81,15 +141,48 @@ def run(
             )
         ]
 
+        evaluation_ds = EvaluationDataset.from_pandas(input_df)
+
         # we will be using gpt-4o for the foreseeable future, we hardcode this
         # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
-            dataset=input_ds,
+            dataset=evaluation_ds,
             batch_size=4,
             run_config=run_config,
             llm=critic_lm,
             metrics=metrics,
             show_progress=True,
         )
         return results
+
+    def _generate_answers_from_model(
+        self, questions: DataFrame, student_model: ModelConfig
+    ) -> DataFrame:
+        """
+        Given a DataFrame containing `user_input` columns, generates responses from the given model
+        and returns a new DataFrame containing its answers in the `response` column.
+        """
+        client = get_openai_client(
+            model_api_base=student_model.base_url, api_key=student_model.api_key
+        )
+
+        # initialize response to write into
+        updated_df = questions.copy()
+        updated_df["response"] = ""
+
+        for i, qna in updated_df.iterrows():
+            messages = [
+                student_model.system_prompt,
+                qna["user_input"],
+            ]
+            response = client.chat.completions.create(
+                messages=messages,
+                model=student_model.model_name,
+                # specify the seed so we can at least try to have some reproducibility when the clients support it
+                seed=42,
+                max_tokens=student_model.max_tokens,
+                temperature=student_model.temperature,
+            )
+            updated_df.at[i, "response"] = response.choices[0].message.content
+        return updated_df