
Commit 3a6fab9

feat: added upload for EvaluationResult (#1625)
1 parent 6ff35f7 commit 3a6fab9

File tree

10 files changed: +118 additions, −65 deletions

docs/howtos/customizations/metrics/write_your_own_metric.ipynb

Lines changed: 17 additions & 14 deletions

@@ -62,7 +62,7 @@
  "source": [
   "from ragas.llms import llm_factory\n",
   "\n",
-  "evaluator_llm = llm_factory('gpt-4o')"
+  "evaluator_llm = llm_factory(\"gpt-4o\")"
  ]
 },
 {
@@ -104,7 +104,7 @@
  "hallucinations_binary = AspectCritic(\n",
  "    name=\"hallucinations_binary\",\n",
  "    definition=\"Did the model hallucinate or add any information that was not present in the retrieved context?\",\n",
- "    llm=evaluator_llm\n",
+ "    llm=evaluator_llm,\n",
  ")\n",
  "\n",
  "await hallucinations_binary.single_turn_ascore(eval_dataset[0])"
@@ -163,9 +163,7 @@
  "from ragas.metrics import RubricsScoreWithoutReference\n",
  "\n",
  "hallucinations_rubric = RubricsScoreWithoutReference(\n",
- "    name=\"hallucinations_rubric\",\n",
- "    llm=evaluator_llm,\n",
- "    rubrics=rubric\n",
+ "    name=\"hallucinations_rubric\", llm=evaluator_llm, rubrics=rubric\n",
  ")\n",
  "\n",
  "await hallucinations_rubric.single_turn_ascore(eval_dataset[0])"
@@ -215,19 +213,28 @@
  "from ragas.callbacks import Callbacks\n",
  "from ragas.dataset_schema import SingleTurnSample\n",
  "\n",
+ "\n",
  "@dataclass\n",
  "class HallucinationsMetric(MetricWithLLM, SingleTurnMetric):\n",
  "    # name of the metric\n",
  "    name: str = \"hallucinations_metric\"\n",
  "    # we need to define the required columns for the metric\n",
- "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=lambda: {MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}})\n",
+ "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(\n",
+ "        default_factory=lambda: {\n",
+ "            MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}\n",
+ "        }\n",
+ "    )\n",
  "\n",
  "    def __post_init__(self):\n",
  "        # init the faithfulness metric\n",
  "        self.faithfulness_metric = Faithfulness(llm=self.llm)\n",
  "\n",
- "    async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks: Callbacks) -> float:\n",
- "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(sample, callbacks)\n",
+ "    async def _single_turn_ascore(\n",
+ "        self, sample: SingleTurnSample, callbacks: Callbacks\n",
+ "    ) -> float:\n",
+ "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(\n",
+ "            sample, callbacks\n",
+ "        )\n",
  "        return 1 - faithfulness_score"
  ]
 },
@@ -269,12 +276,8 @@
  "from ragas import evaluate\n",
  "\n",
  "results = evaluate(\n",
- "    eval_dataset, \n",
- "    metrics=[\n",
- "        hallucinations_metric,\n",
- "        hallucinations_rubric,\n",
- "        hallucinations_binary\n",
- "    ], \n",
+ "    eval_dataset,\n",
+ "    metrics=[hallucinations_metric, hallucinations_rubric, hallucinations_binary],\n",
  ")"
  ]
 },
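
The notebook's custom metric can also be exercised on a single sample before running the full evaluate() call. A minimal sketch, assuming it runs in the same notebook after the cell defining HallucinationsMetric; the sample values are made-up illustrations, not from the commit:

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import llm_factory

# hypothetical sample matching the metric's required columns:
# user_input, response, retrieved_contexts
sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower?",
    response="The Eiffel Tower is in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris, France."],
)

hallucinations_metric = HallucinationsMetric(llm=llm_factory("gpt-4o"))

# top-level await works inside a notebook cell
score = await hallucinations_metric.single_turn_ascore(sample)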

src/ragas/callbacks.py

Lines changed: 13 additions & 12 deletions

@@ -57,13 +57,13 @@ class ChainType(Enum):
 
 
 class ChainRun(BaseModel):
-    run_id: uuid.UUID
-    parent_run_id: t.Optional[uuid.UUID]
+    run_id: str
+    parent_run_id: t.Optional[str]
     name: str
     inputs: t.Dict[str, t.Any]
     metadata: t.Dict[str, t.Any]
     outputs: t.Dict[str, t.Any] = Field(default_factory=dict)
-    children: t.List[uuid.UUID] = Field(default_factory=list)
+    children: t.List[str] = Field(default_factory=list)
 
 
 class ChainRunEncoder(json.JSONEncoder):
@@ -72,12 +72,14 @@ def default(self, o):
             return str(o)
         if isinstance(o, ChainType):
             return o.value
+        # if isinstance(o, EvaluationResult):
+        #     return ""
         return json.JSONEncoder.default(self, o)
 
 
 @dataclass
 class RagasTracer(BaseCallbackHandler):
-    traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict)
+    traces: t.Dict[str, ChainRun] = field(default_factory=dict)
 
     def on_chain_start(
         self,
@@ -90,17 +92,17 @@ def on_chain_start(
         metadata: t.Optional[t.Dict[str, t.Any]] = None,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id] = ChainRun(
-            run_id=run_id,
-            parent_run_id=parent_run_id,
+        self.traces[str(run_id)] = ChainRun(
+            run_id=str(run_id),
+            parent_run_id=str(parent_run_id) if parent_run_id else None,
             name=serialized["name"],
             inputs=inputs,
             metadata=metadata or {},
             children=[],
         )
 
-        if parent_run_id and parent_run_id in self.traces:
-            self.traces[parent_run_id].children.append(run_id)
+        if parent_run_id and str(parent_run_id) in self.traces:
+            self.traces[str(parent_run_id)].children.append(str(run_id))
 
     def on_chain_end(
         self,
@@ -109,12 +111,11 @@ def on_chain_end(
         run_id: uuid.UUID,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id].outputs = outputs
+        self.traces[str(run_id)].outputs = outputs
 
     def to_jsons(self) -> str:
         return json.dumps(
             [t.model_dump() for t in self.traces.values()],
-            indent=4,
             cls=ChainRunEncoder,
         )
 
@@ -131,7 +132,7 @@ def __str__(self):
 
 
 def parse_run_traces(
-    traces: t.Dict[uuid.UUID, ChainRun],
+    traces: t.Dict[str, ChainRun],
 ) -> t.List[t.Dict[str, t.Any]]:
     root_traces = [
         chain_trace
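
Switching ChainRun and RagasTracer.traces from uuid.UUID to plain str keys is what lets the traces be dumped straight to JSON (and later POSTed by upload()). A minimal sketch of that serialization path, assuming ChainRun and ChainRunEncoder are imported from ragas.callbacks as defined in this file; the run id and payload values are made up:

import json

from ragas.callbacks import ChainRun, ChainRunEncoder

# string-keyed trace, mirroring what RagasTracer now stores
traces = {
    "run-1": ChainRun(
        run_id="run-1",
        parent_run_id=None,
        name="ragas evaluation",
        inputs={"n_samples": 2},
        metadata={},
        outputs={"scores": [{"faithfulness": 1.0}]},
    ),
}

# same call shape as RagasTracer.to_jsons(): str keys need no special handling
print(json.dumps([t.model_dump() for t in traces.values()], cls=ChainRunEncoder))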

src/ragas/dataset_schema.py

Lines changed: 42 additions & 9 deletions

@@ -8,13 +8,12 @@
 from datasets import Dataset as HFDataset
 from pydantic import BaseModel, field_validator
 
-from ragas.callbacks import parse_run_traces
+from ragas.callbacks import ChainRunEncoder, parse_run_traces
 from ragas.cost import CostCallbackHandler
 from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
-from ragas.utils import safe_nanmean
+from ragas.utils import RAGAS_API_URL, safe_nanmean
 
 if t.TYPE_CHECKING:
-    import uuid
     from pathlib import Path
 
     from datasets import Dataset as HFDataset
@@ -375,7 +374,7 @@ class EvaluationResult:
     binary_columns: t.List[str] = field(default_factory=list)
     cost_cb: t.Optional[CostCallbackHandler] = None
     traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
-    ragas_traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict, repr=False)
+    ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)
 
     def __post_init__(self):
         # transform scores from list of dicts to dict of lists
@@ -395,6 +394,13 @@ def __post_init__(self):
         # parse the traces
         self.traces = parse_run_traces(self.ragas_traces)
 
+    def __repr__(self) -> str:
+        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
+        return "{" + ", ".join(score_strs) + "}"
+
+    def __getitem__(self, key: str) -> t.List[float]:
+        return self._scores_dict[key]
+
     def to_pandas(self, batch_size: int | None = None, batched: bool = False):
         """
         Convert the result to a pandas DataFrame.
@@ -487,9 +493,36 @@ def total_cost(
             cost_per_input_token, cost_per_output_token, per_model_costs
         )
 
-    def __repr__(self) -> str:
-        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
-        return "{" + ", ".join(score_strs) + "}"
+    def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str:
+        from datetime import datetime, timezone
+
+        import requests
+
+        timestamp = datetime.now(timezone.utc).isoformat()
+        root_trace = [
+            trace for trace in self.ragas_traces.values() if trace.parent_run_id is None
+        ][0]
+        packet = json.dumps(
+            {
+                "run_id": str(root_trace.run_id),
+                "created_at": timestamp,
+                "evaluation_run": [t.model_dump() for t in self.ragas_traces.values()],
+            },
+            cls=ChainRunEncoder,
+        )
 
-    def __getitem__(self, key: str) -> t.List[float]:
-        return self._scores_dict[key]
+        response = requests.post(
+            f"{base_url}/alignment/evaluation",
+            data=packet,
+            headers={"Content-Type": "application/json"},
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Failed to upload results: {response.text}")
+
+        evaluation_endpoint = (
+            f"https://app.ragas.io/alignment/evaluation/{root_trace.run_id}"
+        )
+        if verbose:
+            print(f"Evaluation results uploaded! View at {evaluation_endpoint}")
+        return evaluation_endpoint
src/ragas/evaluation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def evaluate(
344344
ragas_traces=tracer.traces,
345345
)
346346
if not evaluation_group_cm.ended:
347-
evaluation_rm.on_chain_end(result)
347+
evaluation_rm.on_chain_end({"scores": result.scores})
348348
finally:
349349
# reset llms and embeddings if changed
350350
for i in llm_changed:

src/ragas/metrics/_context_entities_recall.py

Lines changed: 3 additions & 1 deletion

@@ -23,7 +23,9 @@ class EntitiesList(BaseModel):
 
 class ExtractEntitiesPrompt(PydanticPrompt[StringIO, EntitiesList]):
     name: str = "text_entity_extraction"
-    instruction: str = "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    instruction: str = (
+        "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    )
     input_model = StringIO
     output_model = EntitiesList
     examples = [

src/ragas/metrics/_context_precision.py

Lines changed: 14 additions & 12 deletions

@@ -33,7 +33,9 @@ class Verification(BaseModel):
 
 class ContextPrecisionPrompt(PydanticPrompt[QAC, Verification]):
     name: str = "context_precision"
-    instruction: str = 'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    instruction: str = (
+        'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    )
     input_model = QAC
     output_model = Verification
     examples = [
@@ -157,17 +159,17 @@ async def _ascore(
         user_input, retrieved_contexts, reference = self._get_row_attributes(row)
         responses = []
         for context in retrieved_contexts:
-            verdicts: t.List[
-                Verification
-            ] = await self.context_precision_prompt.generate_multiple(
-                data=QAC(
-                    question=user_input,
-                    context=context,
-                    answer=reference,
-                ),
-                n=self.reproducibility,
-                llm=self.llm,
-                callbacks=callbacks,
+            verdicts: t.List[Verification] = (
+                await self.context_precision_prompt.generate_multiple(
+                    data=QAC(
+                        question=user_input,
+                        context=context,
+                        answer=reference,
+                    ),
+                    n=self.reproducibility,
+                    llm=self.llm,
+                    callbacks=callbacks,
+                )
             )
 
             responses.append([result.model_dump() for result in verdicts])

src/ragas/metrics/_context_recall.py

Lines changed: 14 additions & 12 deletions

@@ -41,7 +41,9 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    instruction: str = (
+        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    )
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -148,17 +150,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"
 
         # run classification
-        classifications_list: t.List[
-            ContextRecallClassifications
-        ] = await self.context_recall_prompt.generate_multiple(
-            data=QCA(
-                question=row["user_input"],
-                context="\n".join(row["retrieved_contexts"]),
-                answer=row["reference"],
-            ),
-            llm=self.llm,
-            callbacks=callbacks,
-            n=self.reproducibility,
+        classifications_list: t.List[ContextRecallClassifications] = (
+            await self.context_recall_prompt.generate_multiple(
+                data=QCA(
+                    question=row["user_input"],
+                    context="\n".join(row["retrieved_contexts"]),
+                    answer=row["reference"],
+                ),
+                llm=self.llm,
+                callbacks=callbacks,
+                n=self.reproducibility,
+            )
         )
         classification_dicts = []
         for classification in classifications_list:

src/ragas/metrics/_summarization.py

Lines changed: 9 additions & 3 deletions

@@ -31,7 +31,9 @@ class AnswersGenerated(BaseModel):
 
 class ExtractKeyphrasePrompt(PydanticPrompt[StringIO, ExtractedKeyphrases]):
     name: str = "extract_keyphrases"
-    instruction: str = "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    instruction: str = (
+        "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    )
     input_model = StringIO
     output_model = ExtractedKeyphrases
     examples: t.List[t.Tuple[StringIO, ExtractedKeyphrases]] = [
@@ -62,7 +64,9 @@ class GenerateQuestionsPrompt(
     PydanticPrompt[GenerateQuestionsPromptInput, QuestionsGenerated]
 ):
     name: str = "generate_questions"
-    instruction: str = "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    instruction: str = (
+        "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    )
     input_model = GenerateQuestionsPromptInput
     output_model = QuestionsGenerated
     examples: t.List[t.Tuple[GenerateQuestionsPromptInput, QuestionsGenerated]] = [
@@ -99,7 +103,9 @@ class SummaryAndQuestions(BaseModel):
 
 class GenerateAnswersPrompt(PydanticPrompt[SummaryAndQuestions, AnswersGenerated]):
     name: str = "generate_answers"
-    instruction: str = "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    instruction: str = (
+        "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    )
     input_model = SummaryAndQuestions
     output_model = AnswersGenerated
     examples: t.List[t.Tuple[SummaryAndQuestions, AnswersGenerated]] = [

src/ragas/metrics/_topic_adherence.py

Lines changed: 3 additions & 1 deletion

@@ -97,7 +97,9 @@ class TopicRefusedPrompt(PydanticPrompt[TopicRefusedInput, TopicRefusedOutput]):
     ]
 
 
-class TopicExtractionPrompt(PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]):
+class TopicExtractionPrompt(
+    PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]
+):
     instruction: str = (
         "Given an interaction between Human, Tool and AI, extract the topics from Human's input."
     )

src/ragas/utils.py

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,8 @@
 RAGAS_SUPPORTED_LANGUAGE_CODES = {
     v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()
 }
+# endpoint for uploading results
+RAGAS_API_URL = "https://api.ragas.io"
 
 
 @lru_cache(maxsize=1)
