
Commit 60b9e7c

fix: cleaned up some metrics (#2111)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 437925f commit 60b9e7c

File tree: 13 files changed (+360, -756 lines)


experimental/ragas_experimental/cli.py

Lines changed: 1 addition & 1 deletion

@@ -550,7 +550,7 @@ def hello_world(
     )
 
 
-    @numeric_metric(name="accuracy_score", range=(0, 1))
+    @numeric_metric(name="accuracy_score", allowed_values=(0, 1))
     def accuracy_score(response: str, expected: str):
         """
        Is the response a good response to the query?
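
Note: the decorator keyword changes here from range to allowed_values. Below is a minimal usage sketch, not taken from cli.py; the import path and the .score(**kwargs) call convention are assumptions, modelled on the BaseMetric interface introduced later in this commit.

    from ragas_experimental.metrics import numeric_metric  # assumed import path

    @numeric_metric(name="accuracy_score", allowed_values=(0, 1))
    def accuracy_score(response: str, expected: str):
        """Is the response a good response to the query?"""
        # placeholder scoring logic, for illustration only
        return 1.0 if response.strip() == expected.strip() else 0.0

    # assumed call convention, mirroring BaseMetric.score(**kwargs) -> MetricResult
    result = accuracy_score.score(response="Paris", expected="Paris")
    print(result.value)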

experimental/ragas_experimental/dataset.py

Lines changed: 15 additions & 0 deletions

@@ -211,6 +211,21 @@ def save(self) -> None:
         else:
             self.backend.save_dataset(self.name, dict_data, data_model=self.data_model)
 
+    def reload(self) -> None:
+        # Backend always returns dicts
+        # Use the correct backend method based on the class type
+        if hasattr(self, "DATATABLE_TYPE") and self.DATATABLE_TYPE == "Experiment":
+            dict_data = self.backend.load_experiment(self.name)
+        else:
+            dict_data = self.backend.load_dataset(self.name)
+
+        if self.data_model:
+            # Validated mode - convert dicts to Pydantic models
+            self._data = [self.data_model(**d) for d in dict_data]
+        else:
+            # Unvalidated mode - keep as dicts but wrapped in Dataset API
+            self._data = dict_data  # type: ignore
+
     def validate_with(self, data_model: t.Type[T]) -> Self:
         """Apply validation to an unvalidated dataset"""
         if self.data_model is not None:
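
The new reload() mirrors save() in the opposite direction: it pulls the backend copy of the rows back into _data, converting them to the Pydantic data_model when one is attached. A short usage sketch, assuming ds is an existing Dataset already bound to a backend (construction not shown):

    # `ds` is an existing ragas_experimental Dataset instance (creation omitted)
    ds.save()        # push the in-memory rows to the configured backend
    # ... the backend copy may change elsewhere (another process, manual edits) ...
    ds.reload()      # pull the backend copy back into memory
    print(len(ds))   # refreshed row count; Metric.align() relies on len(dataset) below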

experimental/ragas_experimental/metric/__init__.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

experimental/ragas_experimental/metric/result.py

Lines changed: 0 additions & 248 deletions
This file was deleted.

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+from .base import Metric
+from .discrete import DiscreteMetric, discrete_metric
+from .numeric import NumericMetric, numeric_metric
+from .ranking import RankingMetric, ranking_metric
+from .result import MetricResult
+
+__all__ = [
+    "MetricResult",
+    "Metric",
+    "DiscreteMetric",
+    "NumericMetric",
+    "RankingMetric",
+    "discrete_metric",
+    "numeric_metric",
+    "ranking_metric",
+]
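
This hunk adds the __init__ of the new package (presumably ragas_experimental.metrics, given the metric/base.py to metrics/base.py rename below). It consolidates the public surface that the deleted metric package used to provide, so imports would look roughly like this, with the package path being an assumption:

    # assumed package path after the metric -> metrics rename
    from ragas_experimental.metrics import (
        Metric,
        MetricResult,
        DiscreteMetric,
        NumericMetric,
        RankingMetric,
        discrete_metric,
        numeric_metric,
        ranking_metric,
    )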

experimental/ragas_experimental/metric/base.py renamed to experimental/ragas_experimental/metrics/base.py

Lines changed: 45 additions & 3 deletions

@@ -21,6 +21,37 @@
     from ragas_experimental.dataset import Dataset
 
 
+@dataclass
+class BaseMetric(ABC):
+    name: str
+
+    @abstractmethod
+    def score(self, **kwargs) -> MetricResult:
+        pass
+
+    @abstractmethod
+    async def ascore(self, **kwargs) -> MetricResult:
+        pass
+
+    def batch_score(
+        self,
+        inputs: t.List[t.Dict[str, t.Any]],
+    ) -> t.List[MetricResult]:
+        return [self.score(**input_dict) for input_dict in inputs]
+
+    async def abatch_score(
+        self,
+        inputs: t.List[t.Dict[str, t.Any]],
+    ) -> t.List[MetricResult]:
+        async_tasks = []
+        for input_dict in inputs:
+            # Process input asynchronously
+            async_tasks.append(self.ascore(**input_dict))
+
+        # Run all tasks concurrently and return results
+        return await asyncio.gather(*async_tasks)
+
+
 @dataclass
 class Metric(ABC):
     """Base class for all metrics in the LLM evaluation library."""

@@ -48,7 +79,12 @@ def get_variables(self) -> t.List[str]:
     def score(self, llm: RagasLLM, **kwargs) -> MetricResult:
         traces = {}
         traces["input"] = kwargs
+
+        # get prompt
+        if not self.prompt:
+            raise Exception("prompt not passed")
         prompt_input = self.prompt.format(**kwargs)
+
         response = llm.generate(prompt_input, response_model=self._response_model)
         traces["output"] = response.model_dump()
         result = MetricResult(**response.model_dump())

@@ -58,7 +94,11 @@ def score(self, llm: RagasLLM, **kwargs) -> MetricResult:
     async def ascore(self, llm: RagasLLM, **kwargs) -> MetricResult:
         traces = {}
 
+        # get prompt
+        if not self.prompt:
+            raise Exception("prompt not passed")
         prompt_input = self.prompt.format(**kwargs)
+
         traces["input"] = prompt_input
         response = await llm.agenerate(
             prompt_input,

@@ -137,11 +177,13 @@ def align(
         Align the metric with the specified experiments by different optimization methods.
         """
 
-        assert isinstance(self.prompt, Prompt)
+        # get prompt
+        if not self.prompt:
+            raise Exception("prompt not passed")
         self.prompt = DynamicFewShotPrompt.from_prompt(
             self.prompt, embedding_model, **kwargs
         )
-        dataset.load()
+        dataset.reload()
         total_items = len(dataset)
         input_vars = self.get_variables()
         output_vars = [self.name, f"{self.name}_reason"]

@@ -188,7 +230,7 @@ def validate_alignment(
             for v in self.get_variables()
         }
         score = self.score(llm=llm, **values)
-        pred_scores.append(score.result)
+        pred_scores.append(score.value)
 
         df = test_dataset.to_pandas()
         df[f"{self.name}_pred"] = pred_scores
