
Commit f170a64

feat: add ability to generate answers from user questions

When a provided dataset is missing the `response` field, those responses need to be generated. This commit ensures that in that case, we error out when a student model is not configured. Otherwise, when a student model is provided, we always generate the responses, regardless of whether `response` is already in the dataframe.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
1 parent df441c1 commit f170a64
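
For orientation, here is a minimal usage sketch of the behavior this commit introduces. The class, module, and parameter names come from the diff below; the dataset path, server URL, and model name are placeholders, not values from the repository.

# Hypothetical usage of the new student-model generation flow (placeholder values marked).
from pathlib import Path

from instructlab.eval.ragas import ModelConfig, RagasEvaluator

evaluator = RagasEvaluator()

# If the jsonl file lacks a `response` column and no student model is given,
# run() now raises a ValueError up front.
student = ModelConfig(
    base_url="http://localhost:8000/v1",  # placeholder OpenAI-compatible endpoint
    model_name="my-student-model",        # placeholder model name
)

# With a student model configured, responses are generated from `user_input`
# before the Ragas evaluation runs (even if `response` is already present).
results = evaluator.run(dataset=Path("qna.jsonl"), student_model=student)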

File tree

1 file changed: +106 -13 lines changed

src/instructlab/eval/ragas.py

Lines changed: 106 additions & 13 deletions
@@ -1,6 +1,7 @@
 # Standard
 from pathlib import Path
-from typing import List, TypedDict
+from pydantic import BaseModel, ConfigDict
+from typing import List, TypedDict, Optional

 # Third Party
 from langchain_community.chat_models import ChatOpenAI
@@ -9,23 +10,57 @@
     DEFAULT_WITH_REFERENCE_RUBRICS,
     RubricsScore,
 )
-import pandas as pd
+from pandas import DataFrame, read_json

 # Local
 from .evaluator import Evaluator
+from .mt_bench_common import get_openai_client


 class Sample(TypedDict):
+    """
+    TypedDict of a sample that we accept when doing eval with Ragas.
+    We specifically use TypedDict here to be flexible with the input data we accept.
+    """
+
     # question
     user_input: str

     # model answer
-    response: str
+    response: Optional[str]

     # golden answer
     reference: str


+# default system prompt we'll use when none is provided; kept private as we don't intend for this to be a public object
+_DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
+Your primary goal is to answer queries with the most up-to-date and factual information available.
+Focus on delivering clear, concise, and correct responses.
+If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
+Your responses should prioritize accuracy over all other considerations."""
+
+
+class ModelConfig(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    # URL of the OpenAI server where the model is hosted
+    base_url: str
+
+    # name of the model to use
+    model_name: str
+    system_prompt: str = _DEFAULT_SYSTEM_PROMPT
+
+    # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client).
+    # To provide an OpenAI key, you must set it here; else the default is used.
+    api_key: str = "no-api-key"
+
+    # "model randomness", i.e. the likelihood of sampling something other than the likeliest token
+    temperature: float = 0.0
+
+    max_tokens: int = 768
+
+
 class RagasEvaluator(Evaluator):
     # most basic implementation, we just assume that the user will bring the existing model responses
     name = "ragas"
@@ -34,14 +69,24 @@ def __init__(self):
         pass

     def run(
-        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
+        self,
+        dataset: List[Sample] | Path,
+        student_model: ModelConfig | None = None,
+        run_config: RunConfig | None = None,
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.

+        When the `dataset` lacks the `response` field, `student_model` must be provided
+        so that the answers can be generated.
+
         Args:
             dataset (List[Sample] | Path):
-                List of model questions and answers
+                Can be either a list of `Sample` objects or a path to a jsonl file containing
+                records matching `Sample`.
+            student_model (ModelConfig | None, optional):
+                When this parameter is provided, we'll attempt to use the described model to
+                generate the responses for the given list of questions.
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
                 a default one is created containing extremely permissive settings when handling
@@ -55,14 +100,29 @@ def run(
             raise ValueError(
                 "no dataset was provided, please specify the `dataset` argument"
             )
-        if isinstance(dataset, Path):
-            input_ds = EvaluationDataset.from_pandas(
-                pd.read_json(dataset, lines=True, orient="records")
+
+        if type(dataset) not in (list, Path):
+            raise TypeError(f"invalid type of dataset: {type(dataset)}")
+
+        # ensure we are in the dataframe format
+        input_df = None
+        if isinstance(dataset, list):
+            input_df = DataFrame(dataset)
+        elif isinstance(dataset, Path):
+            input_df = read_json(dataset, orient="records", lines=True)
+
+        # this should never happen, but pylint is not smart enough to detect it
+        assert input_df is not None
+
+        need_to_generate_responses = "response" not in input_df.columns
+        if need_to_generate_responses and not student_model:
+            raise ValueError(
+                "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
             )
-        elif isinstance(dataset, list):
-            input_ds = EvaluationDataset.from_list(dataset)
-        else:
-            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
+        # if the student model was provided then we always generate, regardless
+        if student_model:
+            input_df = self._generate_answers_from_model(input_df, student_model)

         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -81,15 +141,48 @@ def run(
             )
         ]

+        evaluation_ds = EvaluationDataset.from_pandas(input_df)
+
         # we will be using gpt-4o for the foreseeable future, we hardcode this
         # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
-            dataset=input_ds,
+            dataset=evaluation_ds,
             batch_size=4,
             run_config=run_config,
             llm=critic_lm,
             metrics=metrics,
             show_progress=True,
         )
         return results
+
+    def _generate_answers_from_model(
+        self, questions: DataFrame, student_model: ModelConfig
+    ) -> DataFrame:
+        """
+        Given a DataFrame containing a `user_input` column, generates responses from the given model
+        and returns a new DataFrame containing its answers in the `response` column.
+        """
+        client = get_openai_client(
+            model_api_base=student_model.base_url, api_key=student_model.api_key
+        )
+
+        # initialize the response column to write into
+        updated_df = questions.copy()
+        updated_df["response"] = ""
+
+        for i, qna in updated_df.iterrows():
+            messages = [
+                {"role": "system", "content": student_model.system_prompt},
+                {"role": "user", "content": qna["user_input"]},
+            ]
+            response = client.chat.completions.create(
+                messages=messages,
+                model=student_model.model_name,
+                # specify the seed so we can at least try to have some reproducibility when the clients support it
+                seed=42,
+                max_tokens=student_model.max_tokens,
+                temperature=student_model.temperature,
+            )
+            updated_df.at[i, "response"] = response.choices[0].message.content
+        return updated_df
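
For reference, each line of the jsonl file passed as `dataset` should be a record matching the `Sample` TypedDict above; with this change, `response` may be omitted when a `student_model` is supplied. The values below are invented for illustration:

{"user_input": "What is the capital of France?", "reference": "The capital of France is Paris."}
{"user_input": "Who wrote Hamlet?", "reference": "William Shakespeare wrote Hamlet.", "response": "Hamlet was written by William Shakespeare."}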
