11# Standard
22from pathlib import Path
3- from typing import List , TypedDict
3+ from pydantic import BaseModel , ConfigDict
4+ from typing import List , TypedDict , Optional
45
56# Third Party
67from langchain_community .chat_models import ChatOpenAI
910 DEFAULT_WITH_REFERENCE_RUBRICS ,
1011 RubricsScore ,
1112)
12- import pandas as pd
13+ from pandas import DataFrame , read_json
1314
1415# Local
1516from .evaluator import Evaluator
17+ from .mt_bench_common import get_openai_client
1618
1719
1820class Sample (TypedDict ):
21+ """
22+ TypedDict of a sample that we accept when doing eval with Ragas.
23+ We specifically use TypedDict here to be flexible with the input data we accept.
24+ """
25+
1926 # question
2027 user_input : str
2128
2229 # model answer
23- response : str
30+ response : Optional [ str ]
2431
2532 # golden answer
2633 reference : str
2734
2835
36+ # default system prompt we'll use when none is provided. Make it private as we don't intend this to be a public object
37+ _DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
38+ Your primary goal is to answer queries with the most up-to-date and factual information available.
39+ Focus on delivering clear, concise, and correct responses.
40+ If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
41+ Your responses should prioritize accuracy over all other considerations."""
42+
43+
44+ class ModelConfig (BaseModel ):
45+ model_config = ConfigDict (protected_namespaces = ())
46+
47+ # URL of the OpenAI server where the model shall be hosted
48+ base_url : str
49+
50+ # name of the model to use
51+ model_name : str
52+ system_prompt : str = _DEFAULT_SYSTEM_PROMPT
53+
54+ # We do NOT read from OPENAI_API_KEY for the student model for security reasons (e.g. sending the API key to another client)
55+ # To provide an OpenAI key, you must set it here; else the default is used.
56+ api_key : str = "no-api-key"
57+
58+ # "model randomness" aka likelihood of sampling something other than the likeliest token
59+ temperature : float = 0.0
60+
61+ max_tokens : int = 768
62+
63+
2964class RagasEvaluator (Evaluator ):
3065 # most basic implementation, we just assume that the user will bring the existing model responses
3166 name = "ragas"
@@ -34,14 +69,24 @@ def __init__(self):
3469 pass
3570
3671 def run (
37- self , dataset : List [Sample ] | Path = None , run_config : RunConfig | None = None
72+ self ,
73+ dataset : List [Sample ] | Path ,
74+ student_model : ModelConfig | None = None ,
75+ run_config : RunConfig = None ,
3876 ) -> EvaluationResult :
3977 """
4078 Evaluates the quality of model responses against a graded rubric.
4179
80+ When the `dataset` lacks the `response` field, then `student_model` must be provided
81+ in order to generate the answers.
82+
4283 Args:
4384 dataset (List[Sample] | Path):
44- List of model questions and answers
85+ Can be either a list of `Sample` objects or a path to a jsonl file containing
86+ records matching `Sample`.
87+ student_model: (StudentModelConfig):
88+ When this parameter is provided, we'll attempt to use the described model in order to
89+ generate the responses from the given list of questions.
4590 run_config (RunConfig | None, optional):
4691 Configuration to use when running evaluations. If none is provided, then
4792 a default one is created containing extremely permissive settings when handling
@@ -55,14 +100,29 @@ def run(
55100 raise ValueError (
56101 "no dataset was provided, please specify the `dataset` argument"
57102 )
58- if isinstance (dataset , Path ):
59- input_ds = EvaluationDataset .from_pandas (
60- pd .read_json (dataset , lines = True , orient = "records" )
103+
104+ if type (dataset ) not in (list , Path ):
105+ raise TypeError (f"invalid type of dataset: { type (dataset )} " )
106+
107+ # ensure we are in the dataframe format
108+ input_df = None
109+ if isinstance (dataset , list ):
110+ input_df = DataFrame (dataset )
111+ elif isinstance (dataset , Path ):
112+ input_df = read_json (dataset , orient = "records" , lines = True )
113+
114+ # this should never happen, but pylint is not smart enough to detect it
115+ assert input_df is not None
116+
117+ need_to_generate_questions = "response" not in input_df .columns
118+ if need_to_generate_questions and not student_model :
119+ raise ValueError (
120+ "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
61121 )
62- elif isinstance ( dataset , list ):
63- input_ds = EvaluationDataset . from_list ( dataset )
64- else :
65- raise TypeError ( f"invalid type passed for dataset: { type ( dataset ) } " )
122+
123+ # if the student model was provided then we always generate regardless
124+ if student_model :
125+ input_df = self . _generate_answers_from_model ( input_df , student_model )
66126
67127 if not run_config :
68128 # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
@@ -81,15 +141,48 @@ def run(
81141 )
82142 ]
83143
144+ evaluation_ds = EvaluationDataset .from_pandas (input_df )
145+
84146 # we will be using gpt-4o for the foreseeable future, we hardcode this
85147 # for consistency of answers
86148 critic_lm = ChatOpenAI (model = "gpt-4o" )
87149 results = evaluate (
88- dataset = input_ds ,
150+ dataset = evaluation_ds ,
89151 batch_size = 4 ,
90152 run_config = run_config ,
91153 llm = critic_lm ,
92154 metrics = metrics ,
93155 show_progress = True ,
94156 )
95157 return results
158+
159+ def _generate_answers_from_model (
160+ self , questions : DataFrame , student_model : ModelConfig
161+ ) -> DataFrame :
162+ """
163+ Given a DataFrame containing `user_input` columns, generates responses from the given model
164+ and returns a new DataFrame containing its answers in the `response` column.
165+ """
166+ client = get_openai_client (
167+ model_api_base = student_model .base_url , api_key = student_model .api_key
168+ )
169+
170+ # initialize response to write into
171+ updated_df = questions .copy ()
172+ updated_df ["response" ] = ""
173+
174+ for i , qna in updated_df .iterrows ():
175+ messages = [
176+ student_model .system_prompt ,
177+ qna ["user_input" ],
178+ ]
179+ response = client .chat .completions .create (
180+ messages = messages ,
181+ model = student_model .model_name ,
182+ # specify the seed so we can at least try to have some reproducibility when the clients support it
183+ seed = 42 ,
184+ max_tokens = student_model .max_tokens ,
185+ temperature = student_model .temperature ,
186+ )
187+ updated_df .at [i , "response" ] = response .choices [0 ].message .content
188+ return updated_df
0 commit comments