
Commit 3443ffa

adds basic ragas eval

Signed-off-by: Oleg S <[email protected]>
1 parent: 2e7e405

File tree: 2 files changed (+81, −0 lines)

requirements.txt (1 addition, 0 deletions)

@@ -10,3 +10,4 @@ pandas
 pandas-stubs
 lm-eval>=0.4.4
 httpx
+ragas

src/instructlab/eval/ragas.py (new file: 80 additions, 0 deletions)

# Standard
from typing import List, TypedDict

# Third Party
from langchain_community.chat_models import ChatOpenAI
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics import RubricsScore
from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS

# Local
from .evaluator import Evaluator


class Sample(TypedDict):
    # question
    user_input: str

    # model answer
    response: str

    # golden answer
    reference: str


class RagasEvaluator(Evaluator):
    # Most basic implementation: we assume the user brings existing model responses.
    name = "ragas"

    def __init__(self):
        pass

    def run(
        self, dataset: List[Sample], run_config: RunConfig | None = None
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

        Args:
            dataset (List[Sample]):
                List of model questions and answers.
            run_config (RunConfig | None, optional):
                Configuration to use when running evaluations. If none is provided,
                a default one is created with extremely permissive timeout and retry
                settings. This is because, by default, OpenAI tier-1 usage accounts
                have very low rate limits, resulting in heavy throttling during
                evaluations.

        Returns:
            EvaluationResult: The results of all evaluations performed by Ragas.
        """
        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate
            # limits are very low and would otherwise leave many results as NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=42,
                timeout=3600,
            )

        input_ds = EvaluationDataset.from_list(dataset)

        # default set of metrics
        metrics = [
            RubricsScore(
                rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
            )
        ]

        # we will be using gpt-4o as the judge for the foreseeable future; we hardcode
        # this for consistency of answers
        critic_lm = ChatOpenAI(model="gpt-4o")
        results = evaluate(
            dataset=input_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
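
For reference, a minimal usage sketch of the new evaluator. The import path instructlab.eval.ragas, the sample contents, and the need for an OPENAI_API_KEY in the environment are assumptions for illustration, not part of this commit:

# Minimal usage sketch; assumes the package is installed so that
# src/instructlab/eval/ragas.py is importable as instructlab.eval.ragas,
# and that OPENAI_API_KEY is set so the gpt-4o judge can be reached.
from typing import List

from ragas.evaluation import RunConfig

from instructlab.eval.ragas import RagasEvaluator, Sample

dataset: List[Sample] = [
    Sample(
        user_input="What is the capital of France?",   # hypothetical question
        response="The capital of France is Paris.",    # hypothetical model answer
        reference="Paris is the capital of France.",   # hypothetical golden answer
    )
]

evaluator = RagasEvaluator()

# Use the permissive default RunConfig...
result = evaluator.run(dataset=dataset)

# ...or pass a custom one for tighter timeouts/retries.
result = evaluator.run(
    dataset=dataset,
    run_config=RunConfig(timeout=600, max_retries=10),
)

# result is a ragas EvaluationResult; result.to_pandas() yields per-sample scores.
print(result)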
