Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ Collaborate with Subject Matter Experts (SMEs) on prompt engineering and make su

### 📊 Evaluation & Testing
Evaluate your LLM applications systematically with both human and automated feedback.
- **Flexible Test Sets**: Create test cases from production data, playground experiments, or upload CSVs
- **Flexible Testsets**: Create testcases from production data, playground experiments, or upload CSVs
- **Pre-built and Custom Evaluators**: Use LLM-as-judge, one of our 20+ pre-built evaluators, or your custom evaluators
- **UI and API Access**: Run evaluations via UI (for SMEs) or programmatically (for engineers)
- **Human Feedback Integration**: Collect and incorporate expert annotations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
EvaluatorConfigDB,
HumanEvaluationDB,
HumanEvaluationScenarioDB,
TestSetDB,
TestsetDB,
)


Expand All @@ -38,7 +38,7 @@
EvaluatorConfigDB, # have workspace_id
HumanEvaluationDB, # have workspace_id
HumanEvaluationScenarioDB, # have workspace_id
TestSetDB, # have workspace_id
TestsetDB, # have workspace_id
]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from oss.src.dbs.postgres.git.dao import GitDAO
from oss.src.core.testcases.service import TestcasesService
from oss.src.models.deprecated_models import DeprecatedTestSetDB
from oss.src.models.deprecated_models import DeprecatedTestsetDB
from oss.src.core.testsets.service import TestsetsService, SimpleTestsetsService


Expand Down Expand Up @@ -81,8 +81,8 @@ async def migration_old_testsets_to_new_testsets(
# Count total rows with a non-null project_id
total_query = (
select(func.count())
.select_from(DeprecatedTestSetDB)
.filter(DeprecatedTestSetDB.project_id.isnot(None))
.select_from(DeprecatedTestsetDB)
.filter(DeprecatedTestsetDB.project_id.isnot(None))
)
result = await connection.execute(total_query)
total_rows = result.scalar()
Expand All @@ -98,8 +98,8 @@ async def migration_old_testsets_to_new_testsets(
while offset < total_testsets:
# STEP 1: Fetch evaluator configurations with non-null project_id
result = await connection.execute(
select(DeprecatedTestSetDB)
.filter(DeprecatedTestSetDB.project_id.isnot(None))
select(DeprecatedTestsetDB)
.filter(DeprecatedTestsetDB.project_id.isnot(None))
.offset(offset)
.limit(DEFAULT_BATCH_SIZE)
)
Expand Down
5 changes: 3 additions & 2 deletions api/ee/docker/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ RUN pip install --upgrade pip \
COPY ./ee /app/ee/
COPY ./oss /app/oss/
COPY ./entrypoint.py ./pyproject.toml /app/
#

RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi
# && pip install -e /sdk/
#

# ENV PYTHONPATH=/sdk:$PYTHONPATH
ENV PYTHONPATH=/sdk:$PYTHONPATH

COPY ./ee/src/crons/meters.sh /meters.sh
COPY ./ee/src/crons/meters.txt /etc/cron.d/meters-cron
Expand Down
5 changes: 3 additions & 2 deletions api/ee/docker/Dockerfile.gh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ RUN pip install --upgrade pip \
COPY ./ee /app/ee/
COPY ./oss /app/oss/
COPY ./entrypoint.py ./pyproject.toml /app/
COPY ./sdk /sdk/

RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi
#
&& poetry install --no-interaction --no-ansi \
&& pip install --force-reinstall --upgrade /sdk/

#

Expand Down
4 changes: 2 additions & 2 deletions api/ee/src/models/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ class HumanEvaluationDB(Base):
DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
)

testset = relationship("TestSetDB")
testset = relationship("TestsetDB")
evaluation_variant = relationship(
"HumanEvaluationVariantDB",
cascade=CASCADE_ALL_DELETE,
Expand Down Expand Up @@ -430,7 +430,7 @@ class EvaluationDB(Base):
)

project = relationship("ee.src.models.db_models.ProjectDB")
testset = relationship("TestSetDB")
testset = relationship("TestsetDB")
variant = relationship("AppVariantDB")
variant_revision = relationship("AppVariantRevisionsDB")
aggregated_results = relationship(
Expand Down
2 changes: 1 addition & 1 deletion api/ee/src/models/extended/deprecated_transfer_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ class HumanEvaluationScenarioDB(DeprecatedBase):
)


class TestSetDB(DeprecatedBase):
class TestsetDB(DeprecatedBase):
__tablename__ = "testsets"
__table_args__ = {"extend_existing": True}

Expand Down
2 changes: 1 addition & 1 deletion api/ee/src/routers/evaluation_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,5 +515,5 @@ async def start_evaluation(

raise HTTPException(
status_code=400,
detail="Columns in the test set should match the names of the inputs in the variant",
detail="Columns in the testset should match the names of the inputs in the variant",
) from e
6 changes: 3 additions & 3 deletions api/ee/src/routers/human_evaluation_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ async def create_human_evaluation(
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
detail="columns in the testset should match the names of the inputs in the variant",
)


Expand Down Expand Up @@ -209,7 +209,7 @@ async def update_human_evaluation(
"""Updates an evaluation's status.

Raises:
HTTPException: If the columns in the test set do not match with the inputs in the variant.
HTTPException: If the columns in the testset do not match with the inputs in the variant.

Returns:
None: A 204 No Content status code, indicating that the update was successful.
Expand Down Expand Up @@ -241,7 +241,7 @@ async def update_human_evaluation(
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
detail="columns in the testset should match the names of the inputs in the variant",
)


Expand Down
12 changes: 6 additions & 6 deletions api/ee/src/services/db_manager_ee.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
AppVariantDB,
UserDB,
AppDB,
TestSetDB,
TestsetDB,
InvitationDB,
EvaluatorConfigDB,
AppVariantRevisionsDB,
Expand Down Expand Up @@ -1418,7 +1418,7 @@ async def fetch_evaluation_by_id(
id=uuid.UUID(evaluation_id),
)
query = base_query.options(
joinedload(EvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(EvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(
Expand Down Expand Up @@ -1451,7 +1451,7 @@ async def list_human_evaluations(app_id: str, project_id: str):
.filter(HumanEvaluationDB.testset_id.isnot(None))
)
query = base_query.options(
joinedload(HumanEvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(HumanEvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(query)
Expand Down Expand Up @@ -1583,7 +1583,7 @@ async def fetch_human_evaluation_by_id(
async with engine.core_session() as session:
base_query = select(HumanEvaluationDB).filter_by(id=uuid.UUID(evaluation_id))
query = base_query.options(
joinedload(HumanEvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(HumanEvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)
result = await session.execute(query)
evaluation = result.scalars().first()
Expand Down Expand Up @@ -1811,7 +1811,7 @@ async def fetch_human_evaluation_scenario_by_evaluation_id(
async def create_new_evaluation(
app: AppDB,
project_id: str,
testset: TestSetDB,
testset: TestsetDB,
status: Result,
variant: str,
variant_revision: str,
Expand Down Expand Up @@ -1859,7 +1859,7 @@ async def list_evaluations(app_id: str, project_id: str):
app_id=uuid.UUID(app_id), project_id=uuid.UUID(project_id)
)
query = base_query.options(
joinedload(EvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(EvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(
Expand Down
14 changes: 10 additions & 4 deletions api/ee/src/services/evaluation_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def prepare_csvdata_and_create_evaluation_scenario(
"""

for datum in csvdata:
# Check whether the inputs in the test set match the inputs in the variant
# Check whether the inputs in the testset match the inputs in the variant
try:
inputs = [
{"input_name": name, "input_value": datum[name]}
Expand All @@ -70,9 +70,9 @@ async def prepare_csvdata_and_create_evaluation_scenario(
evaluation_id=str(new_evaluation.id)
)
msg = f"""
Columns in the test set should match the names of the inputs in the variant.
Columns in the testset should match the names of the inputs in the variant.
Inputs names in variant are: {[variant_input for variant_input in payload_inputs]} while
columns in test set are: {[col for col in datum.keys() if col != 'correct_answer']}
columns in testset are: {[col for col in datum.keys() if col != 'correct_answer']}
"""
raise HTTPException(
status_code=400,
Expand Down Expand Up @@ -396,7 +396,11 @@ async def create_new_evaluation(
"""

app = await db_manager.fetch_app_by_id(app_id=app_id)
testset = await db_manager.fetch_testset_by_id(testset_id=testset_id)
testset = await db_manager.fetch_testset_by_id(
project_id=project_id,
#
testset_id=testset_id,
)
variant_revision = await db_manager.fetch_app_variant_revision_by_id(
variant_revision_id=revision_id
)
Expand All @@ -405,6 +409,8 @@ async def create_new_evaluation(
variant_revision and variant_revision.revision is not None
), f"Variant revision with {revision_id} cannot be None"

assert testset is not None, f"Testset with id {testset_id} does not exist"

evaluation_db = await db_manager_ee.create_new_evaluation(
app=app,
project_id=project_id,
Expand Down
20 changes: 19 additions & 1 deletion api/ee/src/services/llm_apps_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ async def invoke_app(
openapi_parameters: List[Dict],
user_id: str,
project_id: str,
scenario_id: Optional[str] = None,
**kwargs,
) -> InvokationResult:
"""
Expand Down Expand Up @@ -247,7 +248,14 @@ async def invoke_app(
app_response = {}

try:
log.info("Invoking workflow...", url=url)
log.info(
"Invoking application...",
scenario_id=scenario_id,
testcase_id=(
datapoint["testcase_id"] if "testcase_id" in datapoint else None
),
url=url,
)
response = await client.post(
url,
json=payload,
Expand All @@ -268,6 +276,12 @@ async def invoke_app(
trace_id = app_response.get("trace_id", None)
span_id = app_response.get("span_id", None)

log.info(
"Invoked application. ",
scenario_id=scenario_id,
trace_id=trace_id,
)

return InvokationResult(
result=Result(
type=kind,
Expand Down Expand Up @@ -328,6 +342,7 @@ async def run_with_retry(
openapi_parameters: List[Dict],
user_id: str,
project_id: str,
scenario_id: Optional[str] = None,
**kwargs,
) -> InvokationResult:
"""
Expand Down Expand Up @@ -364,6 +379,7 @@ async def run_with_retry(
openapi_parameters,
user_id,
project_id,
scenario_id,
**kwargs,
)
return result
Expand Down Expand Up @@ -403,6 +419,7 @@ async def batch_invoke(
rate_limit_config: Dict,
user_id: str,
project_id: str,
scenarios: Optional[List[Dict]] = None,
**kwargs,
) -> List[InvokationResult]:
"""
Expand Down Expand Up @@ -497,6 +514,7 @@ async def batch_invoke(
openapi_parameters,
user_id,
project_id,
scenarios[index].get("id") if scenarios else None,
**kwargs,
)
)
Expand Down
3 changes: 2 additions & 1 deletion api/ee/src/tasks/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@
TracingQuery,
)
from oss.src.core.workflows.dtos import (
WorkflowServiceData,
WorkflowServiceRequestData,
WorkflowServiceResponseData,
WorkflowServiceRequest,
WorkflowServiceResponse,
WorkflowServiceInterface,
Expand Down
Loading
Loading