
Commit 500219e

refactor: experiment function to use model as additional optional arg (#2165)
1 parent: 6273d92

File tree

4 files changed: +141 -3 lines

docs/experimental/core_concepts/experimentation.md

Lines changed: 103 additions & 0 deletions

````diff
@@ -60,6 +60,109 @@ async def my_experiment(row):
 my_experiment.arun(dataset)
 ```
 
+### Passing Additional Parameters
+
+You can pass additional parameters to your experiment function through `arun()`. This is useful for models, configurations, or anything else your experiment needs. Pass them by keyword, since positional arguments after `dataset` are consumed by `arun()`'s own `name` and `backend` parameters:
+
+```python
+@experiment
+async def my_experiment(row, model):
+    # Process the query with the specified model
+    response = my_app(row.query, model=model)
+
+    # Calculate the metric
+    metric = my_metric.score(response, row.ground_truth)
+
+    # Return results
+    return {**row, "response": response, "accuracy": metric.value}
+
+# Run with a specific model, passed as a keyword argument
+my_experiment.arun(dataset, model="gpt-4o")
+```
+
+### Using Data Models
+
+You can specify a data model for your experiment results at the decorator level:
+
+```python
+from pydantic import BaseModel
+
+class ExperimentResult(BaseModel):
+    response: str
+    accuracy: float
+    model_used: str
+
+@experiment(experiment_model=ExperimentResult)
+async def my_experiment(row, model):
+    response = my_app(row.query, model=model)
+    metric = my_metric.score(response, row.ground_truth)
+    return ExperimentResult(
+        response=response,
+        accuracy=metric.value,
+        model_used=model,
+    )
+
+# Run the experiment with a specific model
+my_experiment.arun(dataset, model="gpt-4o")
+```
+
+### Complete Example: LLM Parameter Passing
+
+Here's a complete example showing how to pass different LLM models to your experiment function:
+
+```python
+import time
+
+from pydantic import BaseModel
+from ragas.experimental import experiment, Dataset
+
+class ExperimentResult(BaseModel):
+    query: str
+    response: str
+    accuracy: float
+    model_used: str
+    latency_ms: float
+
+@experiment(experiment_model=ExperimentResult)
+async def llm_experiment(row, llm_model, temperature=0.7):
+    """Experiment function that accepts an LLM model and other parameters."""
+    start_time = time.time()
+
+    # Use the passed LLM model
+    response = await my_llm_app(
+        query=row.query,
+        model=llm_model,
+        temperature=temperature,
+    )
+
+    # Calculate metrics
+    metric = my_metric.score(response, row.ground_truth)
+    end_time = time.time()
+
+    return ExperimentResult(
+        query=row.query,
+        response=response,
+        accuracy=metric.value,
+        model_used=llm_model,
+        latency_ms=(end_time - start_time) * 1000,
+    )
+
+# Run experiments with different models
+gpt4_results = await llm_experiment.arun(
+    dataset,
+    llm_model="gpt-4o",
+    temperature=0.1,
+)
+
+claude_results = await llm_experiment.arun(
+    dataset,
+    llm_model="claude-4-sonnet",
+    temperature=0.7,
+)
+```
+
 ## Result Storage
 
 Once executed, Ragas processes each row in the dataset, runs it through the function, and stores the results in the `experiments` folder. The storage backend can be configured based on your preferences.
````
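The `arun()` signature touched in this commit is where that configuration hook lives: `backend` accepts either a `BaseBackend` instance or a registered backend name. A minimal, hedged sketch of a configured run; the backend name `local/csv` is an assumption for illustration, not something this diff confirms:

```python
# Hedged sketch: choose a run name and storage backend explicitly, and pass
# the experiment's own parameter by keyword. "local/csv" is an assumed
# registered backend name; substitute whatever backends your install
# exposes via the "ragas.backends" entry-point group.
await my_experiment.arun(
    dataset,
    name="baseline",
    backend="local/csv",
    model="gpt-4o",
)
```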

docs/experimental/tutorials/prompt.md

Lines changed: 28 additions & 1 deletion

````diff
@@ -74,7 +74,34 @@ async def run_experiment(row):
     return experiment_view
 ```
 
-Now whenever you make a change to your prompt, you can run the experiment and see how it affects the performance of your prompt.
+Now whenever you make a change to your prompt, you can run the experiment and see how the change affects its performance.
+
+### Passing Additional Parameters
+
+You can pass additional parameters like models or configurations to your experiment function. Pass them by keyword, since positional arguments after `dataset` are consumed by `arun()`'s own `name` and `backend` parameters:
+
+```python
+@experiment()
+async def run_experiment(row, model):
+    response = run_prompt(row["text"], model=model)
+    score = my_metric.score(
+        prediction=response,
+        actual=row["label"],
+    )
+
+    experiment_view = {
+        **row,
+        "response": response,
+        "score": score.result,
+    }
+    return experiment_view
+
+# Run with a specific model, passed as a keyword argument
+run_experiment.arun(dataset, model="gpt-4o")
+```
 
 
 ## Running the example end to end
````
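Because every extra keyword now flows through `arun()` into the function, comparing models becomes a short loop. A hedged usage sketch, assuming the `run_experiment` and `dataset` objects from the tutorial above; the model names are illustrative placeholders:

```python
import asyncio

# Sweep the same prompt experiment across several models and collect the
# per-model results keyed by model name.
async def sweep(models):
    results = {}
    for model in models:
        results[model] = await run_experiment.arun(dataset, model=model)
    return results

all_results = asyncio.run(sweep(["gpt-4o", "gpt-4o-mini"]))
```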

ragas/src/ragas/experimental/backends/registry.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -120,7 +120,11 @@ def _discover_backends(self) -> None:
             entry_points = entry_points_result.select(group="ragas.backends")
         else:
             # Python 3.9 compatibility
-            entry_points = entry_points_result.get("ragas.backends", [])
+            entry_points = (
+                entry_points_result.get("ragas.backends", [])
+                if isinstance(entry_points_result, dict)
+                else []
+            )
 
         for entry_point in entry_points:
             try:
```
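For context, the guard exists because `importlib.metadata.entry_points()` changed shape across Python versions: on 3.10+ it returns an object with a `.select()` method, while on 3.9 it returns a plain dict mapping group names to entry-point lists. A self-contained, hedged sketch of the same pattern; the function name is illustrative, not the library's API:

```python
from importlib.metadata import entry_points

def discover_ragas_backends():
    eps = entry_points()
    if hasattr(eps, "select"):
        # Python 3.10+: entry_points() returns an object with select()
        selected = eps.select(group="ragas.backends")
    else:
        # Python 3.9: entry_points() returns a dict of group -> entry points
        selected = eps.get("ragas.backends", []) if isinstance(eps, dict) else []
    return {ep.name: ep for ep in selected}
```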

ragas/src/ragas/experimental/experiment.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -81,6 +81,8 @@ async def arun(
         dataset: Dataset,
         name: t.Optional[str] = None,
         backend: t.Optional[t.Union[BaseBackend, str]] = None,
+        *args,
+        **kwargs,
     ) -> "Experiment": ...
 
 
@@ -114,6 +116,8 @@ async def arun(
         dataset: Dataset,
         name: t.Optional[str] = None,
         backend: t.Optional[t.Union[BaseBackend, str]] = None,
+        *args,
+        **kwargs,
     ) -> "Experiment":
         """Run the experiment against a dataset."""
         # Generate name if not provided
@@ -139,7 +143,7 @@ async def arun(
         # Create tasks for all items
         tasks = []
         for item in dataset:
-            tasks.append(self(item))
+            tasks.append(self(item, *args, **kwargs))
 
         progress_bar = None
         try:
```
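Taken together, the three hunks mean anything passed to `arun()` beyond `dataset`, `name`, and `backend` is replayed into every per-row call. A stripped-down, hedged sketch of just that plumbing; `ExperimentSketch` is an illustrative stand-in, not the library's class, and the real implementation also resolves names, backends, and a progress bar:

```python
import asyncio
import typing as t

class ExperimentSketch:
    """Illustrative stand-in modeling only the *args/**kwargs forwarding."""

    def __init__(self, func: t.Callable[..., t.Awaitable[t.Any]]):
        self.func = func

    async def __call__(self, row: t.Any, *args: t.Any, **kwargs: t.Any) -> t.Any:
        return await self.func(row, *args, **kwargs)

    async def arun(self, dataset, name=None, backend=None, *args, **kwargs):
        # Extra arguments are forwarded to every row; bare positionals after
        # `dataset` land in `name` and `backend` first, so pass by keyword.
        tasks = [self(item, *args, **kwargs) for item in dataset]
        return await asyncio.gather(*tasks)

async def scored(row, model):
    return {"row": row, "model": model}

# The `model` keyword reaches the wrapped function for each row.
results = asyncio.run(ExperimentSketch(scored).arun([1, 2, 3], model="gpt-4o"))
```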
