
Commit 3a6fab9

feat: added upload for EvaluationResult (#1625)
1 parent 6ff35f7 commit 3a6fab9

File tree

10 files changed: +118 additions, −65 deletions

docs/howtos/customizations/metrics/write_your_own_metric.ipynb

Lines changed: 17 additions & 14 deletions

@@ -62,7 +62,7 @@
  "source": [
   "from ragas.llms import llm_factory\n",
   "\n",
-  "evaluator_llm = llm_factory('gpt-4o')"
+  "evaluator_llm = llm_factory(\"gpt-4o\")"
  ]
 },
 {
@@ -104,7 +104,7 @@
  "hallucinations_binary = AspectCritic(\n",
  "    name=\"hallucinations_binary\",\n",
  "    definition=\"Did the model hallucinate or add any information that was not present in the retrieved context?\",\n",
- "    llm=evaluator_llm\n",
+ "    llm=evaluator_llm,\n",
  ")\n",
  "\n",
  "await hallucinations_binary.single_turn_ascore(eval_dataset[0])"
@@ -163,9 +163,7 @@
  "from ragas.metrics import RubricsScoreWithoutReference\n",
  "\n",
  "hallucinations_rubric = RubricsScoreWithoutReference(\n",
- "    name=\"hallucinations_rubric\",\n",
- "    llm=evaluator_llm,\n",
- "    rubrics=rubric\n",
+ "    name=\"hallucinations_rubric\", llm=evaluator_llm, rubrics=rubric\n",
  ")\n",
  "\n",
  "await hallucinations_rubric.single_turn_ascore(eval_dataset[0])"
@@ -215,19 +213,28 @@
  "from ragas.callbacks import Callbacks\n",
  "from ragas.dataset_schema import SingleTurnSample\n",
  "\n",
+ "\n",
  "@dataclass\n",
  "class HallucinationsMetric(MetricWithLLM, SingleTurnMetric):\n",
  "    # name of the metric\n",
  "    name: str = \"hallucinations_metric\"\n",
  "    # we need to define the required columns for the metric\n",
- "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=lambda: {MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}})\n",
+ "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(\n",
+ "        default_factory=lambda: {\n",
+ "            MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}\n",
+ "        }\n",
+ "    )\n",
  "\n",
  "    def __post_init__(self):\n",
  "        # init the faithfulness metric\n",
  "        self.faithfulness_metric = Faithfulness(llm=self.llm)\n",
  "\n",
- "    async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks: Callbacks) -> float:\n",
- "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(sample, callbacks)\n",
+ "    async def _single_turn_ascore(\n",
+ "        self, sample: SingleTurnSample, callbacks: Callbacks\n",
+ "    ) -> float:\n",
+ "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(\n",
+ "            sample, callbacks\n",
+ "        )\n",
  "        return 1 - faithfulness_score"
  ]
 },
@@ -269,12 +276,8 @@
  "from ragas import evaluate\n",
  "\n",
  "results = evaluate(\n",
- "    eval_dataset, \n",
- "    metrics=[\n",
- "        hallucinations_metric,\n",
- "        hallucinations_rubric,\n",
- "        hallucinations_binary\n",
- "    ], \n",
+ "    eval_dataset,\n",
+ "    metrics=[hallucinations_metric, hallucinations_rubric, hallucinations_binary],\n",
  ")"
  ]
 },
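
The notebook's custom metric can also be exercised on a single sample before running the full evaluate() call. A minimal sketch, assuming it runs in the same notebook after the cell defining HallucinationsMetric; the sample values are made-up illustrations, not from the commit:

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import llm_factory

# hypothetical sample matching the metric's required columns:
# user_input, response, retrieved_contexts
sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower?",
    response="The Eiffel Tower is in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris, France."],
)

hallucinations_metric = HallucinationsMetric(llm=llm_factory("gpt-4o"))

# top-level await works inside a notebook cell
score = await hallucinations_metric.single_turn_ascore(sample)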

src/ragas/callbacks.py

Lines changed: 13 additions & 12 deletions

@@ -57,13 +57,13 @@ class ChainType(Enum):
 
 
 class ChainRun(BaseModel):
-    run_id: uuid.UUID
-    parent_run_id: t.Optional[uuid.UUID]
+    run_id: str
+    parent_run_id: t.Optional[str]
     name: str
     inputs: t.Dict[str, t.Any]
     metadata: t.Dict[str, t.Any]
     outputs: t.Dict[str, t.Any] = Field(default_factory=dict)
-    children: t.List[uuid.UUID] = Field(default_factory=list)
+    children: t.List[str] = Field(default_factory=list)
 
 
 class ChainRunEncoder(json.JSONEncoder):
@@ -72,12 +72,14 @@ def default(self, o):
             return str(o)
         if isinstance(o, ChainType):
             return o.value
+        # if isinstance(o, EvaluationResult):
+        #     return ""
         return json.JSONEncoder.default(self, o)
 
 
 @dataclass
 class RagasTracer(BaseCallbackHandler):
-    traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict)
+    traces: t.Dict[str, ChainRun] = field(default_factory=dict)
 
     def on_chain_start(
         self,
@@ -90,17 +92,17 @@ def on_chain_start(
         metadata: t.Optional[t.Dict[str, t.Any]] = None,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id] = ChainRun(
-            run_id=run_id,
-            parent_run_id=parent_run_id,
+        self.traces[str(run_id)] = ChainRun(
+            run_id=str(run_id),
+            parent_run_id=str(parent_run_id) if parent_run_id else None,
             name=serialized["name"],
             inputs=inputs,
             metadata=metadata or {},
             children=[],
         )
 
-        if parent_run_id and parent_run_id in self.traces:
-            self.traces[parent_run_id].children.append(run_id)
+        if parent_run_id and str(parent_run_id) in self.traces:
+            self.traces[str(parent_run_id)].children.append(str(run_id))
 
     def on_chain_end(
         self,
@@ -109,12 +111,11 @@ def on_chain_end(
         run_id: uuid.UUID,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id].outputs = outputs
+        self.traces[str(run_id)].outputs = outputs
 
     def to_jsons(self) -> str:
         return json.dumps(
             [t.model_dump() for t in self.traces.values()],
-            indent=4,
             cls=ChainRunEncoder,
         )
 
@@ -131,7 +132,7 @@ def __str__(self):
 
 
 def parse_run_traces(
-    traces: t.Dict[uuid.UUID, ChainRun],
+    traces: t.Dict[str, ChainRun],
 ) -> t.List[t.Dict[str, t.Any]]:
     root_traces = [
         chain_trace
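
Switching ChainRun and RagasTracer.traces from uuid.UUID to plain str keys is what lets the traces be dumped straight to JSON (and later POSTed by upload()). A minimal sketch of that serialization path, assuming ChainRun and ChainRunEncoder are imported from ragas.callbacks as defined in this file; the run id and payload values are made up:

import json

from ragas.callbacks import ChainRun, ChainRunEncoder

# string-keyed trace, mirroring what RagasTracer now stores
traces = {
    "run-1": ChainRun(
        run_id="run-1",
        parent_run_id=None,
        name="ragas evaluation",
        inputs={"n_samples": 2},
        metadata={},
        outputs={"scores": [{"faithfulness": 1.0}]},
    ),
}

# same call shape as RagasTracer.to_jsons(): str keys need no special handling
print(json.dumps([t.model_dump() for t in traces.values()], cls=ChainRunEncoder))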

src/ragas/dataset_schema.py

Lines changed: 42 additions & 9 deletions

@@ -8,13 +8,12 @@
 from datasets import Dataset as HFDataset
 from pydantic import BaseModel, field_validator
 
-from ragas.callbacks import parse_run_traces
+from ragas.callbacks import ChainRunEncoder, parse_run_traces
 from ragas.cost import CostCallbackHandler
 from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
-from ragas.utils import safe_nanmean
+from ragas.utils import RAGAS_API_URL, safe_nanmean
 
 if t.TYPE_CHECKING:
-    import uuid
     from pathlib import Path
 
     from datasets import Dataset as HFDataset
@@ -375,7 +374,7 @@ class EvaluationResult:
     binary_columns: t.List[str] = field(default_factory=list)
     cost_cb: t.Optional[CostCallbackHandler] = None
     traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
-    ragas_traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict, repr=False)
+    ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)
 
     def __post_init__(self):
         # transform scores from list of dicts to dict of lists
@@ -395,6 +394,13 @@ def __post_init__(self):
         # parse the traces
         self.traces = parse_run_traces(self.ragas_traces)
 
+    def __repr__(self) -> str:
+        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
+        return "{" + ", ".join(score_strs) + "}"
+
+    def __getitem__(self, key: str) -> t.List[float]:
+        return self._scores_dict[key]
+
     def to_pandas(self, batch_size: int | None = None, batched: bool = False):
         """
         Convert the result to a pandas DataFrame.
@@ -487,9 +493,36 @@ def total_cost(
             cost_per_input_token, cost_per_output_token, per_model_costs
         )
 
-    def __repr__(self) -> str:
-        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
-        return "{" + ", ".join(score_strs) + "}"
+    def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str:
+        from datetime import datetime, timezone
+
+        import requests
+
+        timestamp = datetime.now(timezone.utc).isoformat()
+        root_trace = [
+            trace for trace in self.ragas_traces.values() if trace.parent_run_id is None
+        ][0]
+        packet = json.dumps(
+            {
+                "run_id": str(root_trace.run_id),
+                "created_at": timestamp,
+                "evaluation_run": [t.model_dump() for t in self.ragas_traces.values()],
+            },
+            cls=ChainRunEncoder,
+        )
 
-    def __getitem__(self, key: str) -> t.List[float]:
-        return self._scores_dict[key]
+        response = requests.post(
+            f"{base_url}/alignment/evaluation",
+            data=packet,
+            headers={"Content-Type": "application/json"},
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Failed to upload results: {response.text}")
+
+        evaluation_endpoint = (
+            f"https://app.ragas.io/alignment/evaluation/{root_trace.run_id}"
+        )
+        if verbose:
+            print(f"Evaluation results uploaded! View at {evaluation_endpoint}")
+        return evaluation_endpoint
src/ragas/evaluation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def evaluate(
344344
ragas_traces=tracer.traces,
345345
)
346346
if not evaluation_group_cm.ended:
347-
evaluation_rm.on_chain_end(result)
347+
evaluation_rm.on_chain_end({"scores": result.scores})
348348
finally:
349349
# reset llms and embeddings if changed
350350
for i in llm_changed:

src/ragas/metrics/_context_entities_recall.py

Lines changed: 3 additions & 1 deletion

@@ -23,7 +23,9 @@ class EntitiesList(BaseModel):
 
 class ExtractEntitiesPrompt(PydanticPrompt[StringIO, EntitiesList]):
     name: str = "text_entity_extraction"
-    instruction: str = "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    instruction: str = (
+        "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    )
     input_model = StringIO
     output_model = EntitiesList
     examples = [

src/ragas/metrics/_context_precision.py

Lines changed: 14 additions & 12 deletions

@@ -33,7 +33,9 @@ class Verification(BaseModel):
 
 class ContextPrecisionPrompt(PydanticPrompt[QAC, Verification]):
     name: str = "context_precision"
-    instruction: str = 'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    instruction: str = (
+        'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    )
     input_model = QAC
     output_model = Verification
     examples = [
@@ -157,17 +159,17 @@ async def _ascore(
         user_input, retrieved_contexts, reference = self._get_row_attributes(row)
         responses = []
         for context in retrieved_contexts:
-            verdicts: t.List[
-                Verification
-            ] = await self.context_precision_prompt.generate_multiple(
-                data=QAC(
-                    question=user_input,
-                    context=context,
-                    answer=reference,
-                ),
-                n=self.reproducibility,
-                llm=self.llm,
-                callbacks=callbacks,
+            verdicts: t.List[Verification] = (
+                await self.context_precision_prompt.generate_multiple(
+                    data=QAC(
+                        question=user_input,
+                        context=context,
+                        answer=reference,
+                    ),
+                    n=self.reproducibility,
+                    llm=self.llm,
+                    callbacks=callbacks,
+                )
             )
 
             responses.append([result.model_dump() for result in verdicts])

src/ragas/metrics/_context_recall.py

Lines changed: 14 additions & 12 deletions

@@ -41,7 +41,9 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    instruction: str = (
+        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    )
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -148,17 +150,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"
 
         # run classification
-        classifications_list: t.List[
-            ContextRecallClassifications
-        ] = await self.context_recall_prompt.generate_multiple(
-            data=QCA(
-                question=row["user_input"],
-                context="\n".join(row["retrieved_contexts"]),
-                answer=row["reference"],
-            ),
-            llm=self.llm,
-            callbacks=callbacks,
-            n=self.reproducibility,
+        classifications_list: t.List[ContextRecallClassifications] = (
+            await self.context_recall_prompt.generate_multiple(
+                data=QCA(
+                    question=row["user_input"],
+                    context="\n".join(row["retrieved_contexts"]),
+                    answer=row["reference"],
+                ),
+                llm=self.llm,
+                callbacks=callbacks,
+                n=self.reproducibility,
+            )
         )
         classification_dicts = []
         for classification in classifications_list:

src/ragas/metrics/_summarization.py

Lines changed: 9 additions & 3 deletions

@@ -31,7 +31,9 @@ class AnswersGenerated(BaseModel):
 
 class ExtractKeyphrasePrompt(PydanticPrompt[StringIO, ExtractedKeyphrases]):
     name: str = "extract_keyphrases"
-    instruction: str = "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    instruction: str = (
+        "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    )
     input_model = StringIO
     output_model = ExtractedKeyphrases
     examples: t.List[t.Tuple[StringIO, ExtractedKeyphrases]] = [
@@ -62,7 +64,9 @@ class GenerateQuestionsPrompt(
     PydanticPrompt[GenerateQuestionsPromptInput, QuestionsGenerated]
 ):
     name: str = "generate_questions"
-    instruction: str = "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    instruction: str = (
+        "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    )
     input_model = GenerateQuestionsPromptInput
     output_model = QuestionsGenerated
     examples: t.List[t.Tuple[GenerateQuestionsPromptInput, QuestionsGenerated]] = [
@@ -99,7 +103,9 @@ class SummaryAndQuestions(BaseModel):
 
 class GenerateAnswersPrompt(PydanticPrompt[SummaryAndQuestions, AnswersGenerated]):
     name: str = "generate_answers"
-    instruction: str = "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    instruction: str = (
+        "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    )
     input_model = SummaryAndQuestions
     output_model = AnswersGenerated
     examples: t.List[t.Tuple[SummaryAndQuestions, AnswersGenerated]] = [

src/ragas/metrics/_topic_adherence.py

Lines changed: 3 additions & 1 deletion

@@ -97,7 +97,9 @@ class TopicRefusedPrompt(PydanticPrompt[TopicRefusedInput, TopicRefusedOutput]):
     ]
 
 
-class TopicExtractionPrompt(PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]):
+class TopicExtractionPrompt(
+    PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]
+):
     instruction: str = (
         "Given an interaction between Human, Tool and AI, extract the topics from Human's input."
     )

src/ragas/utils.py

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,8 @@
 RAGAS_SUPPORTED_LANGUAGE_CODES = {
     v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()
 }
+# endpoint for uploading results
+RAGAS_API_URL = "https://api.ragas.io"
 
 
 @lru_cache(maxsize=1)
