 # Standard
+from pathlib import Path
 from typing import List, TypedDict

 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd

 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass

     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.

         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
@@ -47,6 +51,19 @@ def run(
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
+        if not dataset:
+            raise ValueError(
+                "no dataset was provided, please specify the `dataset` argument"
+            )
+        if isinstance(dataset, Path):
+            input_ds = EvaluationDataset.from_pandas(
+                pd.read_json(dataset, lines=True, orient="records")
+            )
+        elif isinstance(dataset, list):
+            input_ds = EvaluationDataset.from_list(dataset)
+        else:
+            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -57,17 +74,15 @@ def run(
                 timeout=3600,
             )

-        # we will be using gpt-4o for the foreseeable future, we hardcode this
-        # for consistency of answers
-        input_ds = EvaluationDataset.from_list(dataset)
-
         # default set of metrics
         metrics = [
             RubricsScore(
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
             )
         ]

+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
             dataset=input_ds,
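
For reference, a minimal usage sketch of the changed `run` signature. The class name `RagasEvaluator` and the sample field names (`user_input`, `response`, `reference`) are assumptions not shown in this diff; the field names follow what Ragas' `EvaluationDataset` expects for reference-based rubric scoring.

# sketch only, not part of the commit; the judge model is gpt-4o, so OPENAI_API_KEY must be set
from pathlib import Path

evaluator = RagasEvaluator()  # assumed name of the evaluator class defined in this module

# Option 1: pass an in-memory list of samples
samples = [
    {
        "user_input": "What is the capital of France?",   # question sent to the model
        "response": "The capital of France is Paris.",    # model answer being graded
        "reference": "Paris is the capital of France.",   # gold answer used by the rubric
    }
]
results = evaluator.run(dataset=samples)

# Option 2: pass a Path to a JSONL file; the new branch loads it with
# pd.read_json(path, lines=True, orient="records") before building the EvaluationDataset
results = evaluator.run(dataset=Path("samples.jsonl"))

print(results.to_pandas())  # the EvaluationResult can be inspected as a DataFrame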