
Commit 926ca16

feat: add llm_factory and embedding_factory (#2112)
This PR introduces `llm_factory` and `embedding_factory` to provide a unified interface for creating LLM and embedding instances across multiple providers.

### What's new

- `llm_factory` for creating LLM instances
- `embedding_factory` for creating embedding instances
- Support for OpenAI, Google, and LiteLLM providers
- Consistent sync/async interface across all providers

### Usage

```python
from ragas_experimental import llm_factory, embedding_factory
from litellm import acompletion, completion, embedding, aembedding
from openai import OpenAI, AsyncOpenAI
from pydantic import BaseModel

# Create LLM instance
llm = llm_factory("litellm/openai/gpt-4o", client=completion)  # use acompletion if you want async
llm = llm_factory("openai/gpt-4o", client=OpenAI)  # use AsyncOpenAI if you want async

# Generate with structured output
class HelloWorld(BaseModel):
    text: str

llm.generate("hai", HelloWorld)
# Returns: HelloWorld(text='Hello! How can I assist you today?')

# Create embedding instance
emb = embedding_factory("litellm/openai/text-embedding-3-small", client=embedding)
emb = embedding_factory("litellm/openai/text-embedding-3-small", client=OpenAI)

# Async embedding
await emb.aembed_text("hello")
# Returns: 1536-dimensional vector
```

### Benefits

- Seamless switching between providers
- Consistent API for both LLMs and embeddings
- Built-in support for structured outputs with Pydantic
- Full async support

This provides a clean abstraction layer for working with different AI providers in the ragas experimental framework.
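To make the provider-switching claim concrete, here is a minimal sketch built only from the calls shown above. The model strings and the `Answer` model are illustrative, and the client values follow the usage examples in this description rather than any documented signature:

```python
from litellm import completion
from openai import OpenAI
from pydantic import BaseModel

from ragas_experimental import llm_factory


class Answer(BaseModel):
    text: str


# Per the usage examples above, the same calling code works whether the
# model is routed through LiteLLM or hit via the OpenAI client directly.
for model_id, client in [
    ("litellm/openai/gpt-4o", completion),  # LiteLLM completion function
    ("openai/gpt-4o", OpenAI),              # OpenAI client, as in the description
]:
    llm = llm_factory(model_id, client=client)
    result = llm.generate("Say hello in one short sentence.", Answer)
    print(model_id, "->", result.text)
```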
1 parent 60b9e7c commit 926ca16

File tree

26 files changed: +1578, -131 lines changed


.github/workflows/claude-code.yaml

Lines changed: 22 additions & 14 deletions
@@ -1,4 +1,4 @@
-name: Claude Code Assistant
+name: Claude PR Assistant
 
 on:
   issue_comment:
@@ -10,21 +10,29 @@ on:
   pull_request_review:
     types: [submitted]
 
-permissions:
-  contents: write
-  issues: write
-  pull-requests: write
-  id-token: write
-
 jobs:
-  claude-response:
-    name: Claude Code Response
+  claude-code-action:
+    if: |
+      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
+      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
+      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
+      (github.event_name == 'issues' && contains(github.event.issue.body, '@claude'))
     runs-on: ubuntu-latest
-    timeout-minutes: 30
-    if: contains(github.event.comment.body, '@claude') || github.event_name == 'issues' || github.event_name == 'pull_request_review'
+    permissions:
+      contents: read
+      pull-requests: read
+      issues: read
+      id-token: write
    steps:
-      - name: Claude Code Action
-        uses: anthropics/claude-code-action@v1
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Run Claude PR Action
+        uses: anthropics/claude-code-action@beta
         with:
           anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
-          github_token: ${{ secrets.GITHUB_TOKEN }}
+          # Or use OAuth token instead:
+          # claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+          timeout_minutes: "60"

docs/experimental/tutorials/agent.md

Lines changed: 2 additions & 2 deletions
@@ -41,8 +41,8 @@ df.to_csv("datasets/test_dataset.csv", index=False)
 To evaluate the performance of our agent, we will define a non llm metric that compares if our agent's output is within a certain tolerance of the expected output and outputs 1/0 based on it.
 
 ```python
-from ragas_experimental.metric import numeric_metric
-from ragas_experimental.metric.result import MetricResult
+from ragas_experimental.metrics import numeric_metric
+from ragas_experimental.metrics.result import MetricResult
 
 @numeric_metric(name="correctness")
 def correctness_metric(prediction: float, actual: float):
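As a quick illustration of the renamed import path in use (not the tutorial's actual implementation), a minimal `numeric_metric` might look like the sketch below; the tolerance value and the bare-float return are assumptions, and the tutorial may wrap the score in `MetricResult` instead:

```python
from ragas_experimental.metrics import numeric_metric


@numeric_metric(name="correctness")
def correctness_metric(prediction: float, actual: float):
    # Hypothetical tolerance check: 1.0 if the agent's answer is close
    # enough to the expected value, else 0.0. The real tutorial body
    # (and whether it returns a MetricResult) is not shown in this hunk.
    tolerance = 1e-3
    return 1.0 if abs(prediction - actual) <= tolerance else 0.0
```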

docs/experimental/tutorials/prompt.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ pd.DataFrame(samples).to_csv("datasets/test_dataset.csv", index=False)
 Now we need to have a way to measure the performance of our prompt in this task. We will define a metric that will compare the output of our prompt with the expected output and outputs pass/fail based on it.
 
 ```python
-from ragas_experimental.metric import discrete_metric
-from ragas_experimental.metric.result import MetricResult
+from ragas_experimental.metrics import discrete_metric
+from ragas_experimental.metrics.result import MetricResult
 
 @discrete_metric(name="accuracy", values=["pass", "fail"])
 def my_metric(prediction: str, actual: str):

docs/experimental/tutorials/rag.md

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ pd.DataFrame(samples).to_csv("datasets/test_dataset.csv", index=False)
 To evaluate the performance of our RAG system, we will define a llm based metric that compares the output of our RAG system with the grading notes and outputs pass/fail based on it.
 
 ```python
-from ragas_experimental.metric import DiscreteMetric
+from ragas_experimental.metrics import DiscreteMetric
 my_metric = DiscreteMetric(
     name="correctness",
     prompt = "Check if the response contains points mentioned from the grading notes and return 'pass' or 'fail'.\nResponse: {response} Grading Notes: {grading_notes}",

docs/experimental/tutorials/workflow.md

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ pd.DataFrame(dataset_dict).to_csv("datasets/test_dataset.csv", index=False)
 To evaluate the performance of our workflow, we will define a llm based metric that compares the output of our workflow with the pass criteria and outputs pass/fail based on it.
 
 ```python
-from ragas_experimental.metric import DiscreteMetric
+from ragas_experimental.metrics import DiscreteMetric
 
 my_metric = DiscreteMetric(
     name="response_quality",

experimental/ragas_examples/agent_evals/evals.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from ragas_experimental import Dataset, experiment
-from ragas_experimental.metric.numeric import numeric_metric
-from ragas_experimental.metric.result import MetricResult
+from ragas_experimental.metrics.numeric import numeric_metric
+from ragas_experimental.metrics.result import MetricResult
 from .agent import get_default_agent
 
 math_agent = get_default_agent()

experimental/ragas_examples/prompt_evals/evals.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from ragas_experimental import Dataset, experiment
-from ragas_experimental.metric.result import MetricResult
-from ragas_experimental.metric.discrete import discrete_metric
+from ragas_experimental.metrics.result import MetricResult
+from ragas_experimental.metrics.discrete import discrete_metric
 
 from .prompt import run_prompt

experimental/ragas_examples/rag_eval/evals.py

Lines changed: 3 additions & 3 deletions
@@ -1,13 +1,13 @@
 from ragas_experimental import Dataset, experiment
-from ragas_experimental.metric import DiscreteMetric
+from ragas_experimental.metrics import DiscreteMetric
 from openai import OpenAI
-from ragas_experimental.llms import ragas_llm
+from ragas_experimental.llms import llm_factory
 import os
 from .rag import default_rag_client
 
 openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 rag_client = default_rag_client(llm_client=openai_client)
-llm = ragas_llm("openai","gpt-4o", openai_client)
+llm = llm_factory("openai","gpt-4o", openai_client)
 
 def load_dataset():
experimental/ragas_examples/workflow_eval/evals.py

Lines changed: 3 additions & 3 deletions
@@ -1,13 +1,13 @@
 import os
 from openai import OpenAI
 from ragas_experimental import Dataset, experiment
-from ragas_experimental.metric import DiscreteMetric
-from ragas_experimental.llms import ragas_llm
+from ragas_experimental.metrics import DiscreteMetric
+from ragas_experimental.llms import llm_factory
 from .workflow import default_workflow_client
 
 
 workflow_client = default_workflow_client()
-llm = ragas_llm("openai", "gpt-4o", OpenAI(api_key=os.environ.get("OPENAI_API_KEY")))
+llm = llm_factory("openai", "gpt-4o", OpenAI(api_key=os.environ.get("OPENAI_API_KEY")))
experimental/ragas_experimental/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -13,5 +13,7 @@
 
 from ragas_experimental.dataset import Dataset
 from ragas_experimental.experiment import experiment, Experiment
+from ragas_experimental.llms import llm_factory
+from ragas_experimental.embeddings import embedding_factory
 
-__all__ = ["Dataset", "experiment", "Experiment"]
+__all__ = ["Dataset", "experiment", "Experiment", "llm_factory", "embedding_factory"]
