Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ Collaborate with Subject Matter Experts (SMEs) on prompt engineering and make su

### 📊 Evaluation & Testing
Evaluate your LLM applications systematically with both human and automated feedback.
- **Flexible Test Sets**: Create test cases from production data, playground experiments, or upload CSVs
- **Flexible Testsets**: Create testcases from production data, playground experiments, or upload CSVs
- **Pre-built and Custom Evaluators**: Use LLM-as-judge, one of our 20+ pre-built evaluators, or your custom evaluators
- **UI and API Access**: Run evaluations via UI (for SMEs) or programmatically (for engineers)
- **Human Feedback Integration**: Collect and incorporate expert annotations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
EvaluatorConfigDB,
HumanEvaluationDB,
HumanEvaluationScenarioDB,
TestSetDB,
TestsetDB,
)


Expand All @@ -38,7 +38,7 @@
EvaluatorConfigDB, # have workspace_id
HumanEvaluationDB, # have workspace_id
HumanEvaluationScenarioDB, # have workspace_id
TestSetDB, # have workspace_id
TestsetDB, # have workspace_id
]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from oss.src.dbs.postgres.git.dao import GitDAO
from oss.src.core.testcases.service import TestcasesService
from oss.src.models.deprecated_models import DeprecatedTestSetDB
from oss.src.models.deprecated_models import DeprecatedTestsetDB
from oss.src.core.testsets.service import TestsetsService, SimpleTestsetsService


Expand Down Expand Up @@ -81,8 +81,8 @@ async def migration_old_testsets_to_new_testsets(
# Count total rows with a non-null project_id
total_query = (
select(func.count())
.select_from(DeprecatedTestSetDB)
.filter(DeprecatedTestSetDB.project_id.isnot(None))
.select_from(DeprecatedTestsetDB)
.filter(DeprecatedTestsetDB.project_id.isnot(None))
)
result = await connection.execute(total_query)
total_rows = result.scalar()
Expand All @@ -98,8 +98,8 @@ async def migration_old_testsets_to_new_testsets(
while offset < total_testsets:
# STEP 1: Fetch evaluator configurations with non-null project_id
result = await connection.execute(
select(DeprecatedTestSetDB)
.filter(DeprecatedTestSetDB.project_id.isnot(None))
select(DeprecatedTestsetDB)
.filter(DeprecatedTestsetDB.project_id.isnot(None))
.offset(offset)
.limit(DEFAULT_BATCH_SIZE)
)
Expand Down
5 changes: 3 additions & 2 deletions api/ee/docker/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ RUN pip install --upgrade pip \
COPY ./ee /app/ee/
COPY ./oss /app/oss/
COPY ./entrypoint.py ./pyproject.toml /app/
#

RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi
# && pip install -e /sdk/
#

# ENV PYTHONPATH=/sdk:$PYTHONPATH
ENV PYTHONPATH=/sdk:$PYTHONPATH

COPY ./ee/src/crons/meters.sh /meters.sh
COPY ./ee/src/crons/meters.txt /etc/cron.d/meters-cron
Expand Down
5 changes: 3 additions & 2 deletions api/ee/docker/Dockerfile.gh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ RUN pip install --upgrade pip \
COPY ./ee /app/ee/
COPY ./oss /app/oss/
COPY ./entrypoint.py ./pyproject.toml /app/
COPY ./sdk /sdk/

RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi
#
&& poetry install --no-interaction --no-ansi \
&& pip install --force-reinstall --upgrade /sdk/

#

Expand Down
4 changes: 2 additions & 2 deletions api/ee/src/models/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ class HumanEvaluationDB(Base):
DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
)

testset = relationship("TestSetDB")
testset = relationship("TestsetDB")
evaluation_variant = relationship(
"HumanEvaluationVariantDB",
cascade=CASCADE_ALL_DELETE,
Expand Down Expand Up @@ -430,7 +430,7 @@ class EvaluationDB(Base):
)

project = relationship("ee.src.models.db_models.ProjectDB")
testset = relationship("TestSetDB")
testset = relationship("TestsetDB")
variant = relationship("AppVariantDB")
variant_revision = relationship("AppVariantRevisionsDB")
aggregated_results = relationship(
Expand Down
2 changes: 1 addition & 1 deletion api/ee/src/models/extended/deprecated_transfer_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ class HumanEvaluationScenarioDB(DeprecatedBase):
)


class TestSetDB(DeprecatedBase):
class TestsetDB(DeprecatedBase):
__tablename__ = "testsets"
__table_args__ = {"extend_existing": True}

Expand Down
2 changes: 1 addition & 1 deletion api/ee/src/routers/evaluation_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,5 +515,5 @@ async def start_evaluation(

raise HTTPException(
status_code=400,
detail="Columns in the test set should match the names of the inputs in the variant",
detail="Columns in the testset should match the names of the inputs in the variant",
) from e
6 changes: 3 additions & 3 deletions api/ee/src/routers/human_evaluation_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ async def create_human_evaluation(
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
detail="columns in the testset should match the names of the inputs in the variant",
)


Expand Down Expand Up @@ -209,7 +209,7 @@ async def update_human_evaluation(
"""Updates an evaluation's status.

Raises:
HTTPException: If the columns in the test set do not match with the inputs in the variant.
HTTPException: If the columns in the testset do not match with the inputs in the variant.

Returns:
None: A 204 No Content status code, indicating that the update was successful.
Expand Down Expand Up @@ -241,7 +241,7 @@ async def update_human_evaluation(
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
detail="columns in the testset should match the names of the inputs in the variant",
)


Expand Down
12 changes: 6 additions & 6 deletions api/ee/src/services/db_manager_ee.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
AppVariantDB,
UserDB,
AppDB,
TestSetDB,
TestsetDB,
InvitationDB,
EvaluatorConfigDB,
AppVariantRevisionsDB,
Expand Down Expand Up @@ -1418,7 +1418,7 @@ async def fetch_evaluation_by_id(
id=uuid.UUID(evaluation_id),
)
query = base_query.options(
joinedload(EvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(EvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(
Expand Down Expand Up @@ -1451,7 +1451,7 @@ async def list_human_evaluations(app_id: str, project_id: str):
.filter(HumanEvaluationDB.testset_id.isnot(None))
)
query = base_query.options(
joinedload(HumanEvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(HumanEvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(query)
Expand Down Expand Up @@ -1583,7 +1583,7 @@ async def fetch_human_evaluation_by_id(
async with engine.core_session() as session:
base_query = select(HumanEvaluationDB).filter_by(id=uuid.UUID(evaluation_id))
query = base_query.options(
joinedload(HumanEvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(HumanEvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)
result = await session.execute(query)
evaluation = result.scalars().first()
Expand Down Expand Up @@ -1811,7 +1811,7 @@ async def fetch_human_evaluation_scenario_by_evaluation_id(
async def create_new_evaluation(
app: AppDB,
project_id: str,
testset: TestSetDB,
testset: TestsetDB,
status: Result,
variant: str,
variant_revision: str,
Expand Down Expand Up @@ -1859,7 +1859,7 @@ async def list_evaluations(app_id: str, project_id: str):
app_id=uuid.UUID(app_id), project_id=uuid.UUID(project_id)
)
query = base_query.options(
joinedload(EvaluationDB.testset.of_type(TestSetDB)).load_only(TestSetDB.id, TestSetDB.name), # type: ignore
joinedload(EvaluationDB.testset.of_type(TestsetDB)).load_only(TestsetDB.id, TestsetDB.name), # type: ignore
)

result = await session.execute(
Expand Down
14 changes: 10 additions & 4 deletions api/ee/src/services/evaluation_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def prepare_csvdata_and_create_evaluation_scenario(
"""

for datum in csvdata:
# Check whether the inputs in the test set match the inputs in the variant
# Check whether the inputs in the testset match the inputs in the variant
try:
inputs = [
{"input_name": name, "input_value": datum[name]}
Expand All @@ -70,9 +70,9 @@ async def prepare_csvdata_and_create_evaluation_scenario(
evaluation_id=str(new_evaluation.id)
)
msg = f"""
Columns in the test set should match the names of the inputs in the variant.
Columns in the testset should match the names of the inputs in the variant.
Inputs names in variant are: {[variant_input for variant_input in payload_inputs]} while
columns in test set are: {[col for col in datum.keys() if col != 'correct_answer']}
columns in testset are: {[col for col in datum.keys() if col != 'correct_answer']}
"""
raise HTTPException(
status_code=400,
Expand Down Expand Up @@ -396,7 +396,11 @@ async def create_new_evaluation(
"""

app = await db_manager.fetch_app_by_id(app_id=app_id)
testset = await db_manager.fetch_testset_by_id(testset_id=testset_id)
testset = await db_manager.fetch_testset_by_id(
project_id=project_id,
#
testset_id=testset_id,
)
variant_revision = await db_manager.fetch_app_variant_revision_by_id(
variant_revision_id=revision_id
)
Expand All @@ -405,6 +409,8 @@ async def create_new_evaluation(
variant_revision and variant_revision.revision is not None
), f"Variant revision with {revision_id} cannot be None"

assert testset is not None, f"Testset with id {testset_id} does not exist"

evaluation_db = await db_manager_ee.create_new_evaluation(
app=app,
project_id=project_id,
Expand Down
20 changes: 19 additions & 1 deletion api/ee/src/services/llm_apps_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ async def invoke_app(
openapi_parameters: List[Dict],
user_id: str,
project_id: str,
scenario_id: Optional[str] = None,
**kwargs,
) -> InvokationResult:
"""
Expand Down Expand Up @@ -247,7 +248,14 @@ async def invoke_app(
app_response = {}

try:
log.info("Invoking workflow...", url=url)
log.info(
"Invoking application...",
scenario_id=scenario_id,
testcase_id=(
datapoint["testcase_id"] if "testcase_id" in datapoint else None
),
url=url,
)
response = await client.post(
url,
json=payload,
Expand All @@ -268,6 +276,12 @@ async def invoke_app(
trace_id = app_response.get("trace_id", None)
span_id = app_response.get("span_id", None)

log.info(
"Invoked application. ",
scenario_id=scenario_id,
trace_id=trace_id,
)

return InvokationResult(
result=Result(
type=kind,
Expand Down Expand Up @@ -328,6 +342,7 @@ async def run_with_retry(
openapi_parameters: List[Dict],
user_id: str,
project_id: str,
scenario_id: Optional[str] = None,
**kwargs,
) -> InvokationResult:
"""
Expand Down Expand Up @@ -364,6 +379,7 @@ async def run_with_retry(
openapi_parameters,
user_id,
project_id,
scenario_id,
**kwargs,
)
return result
Expand Down Expand Up @@ -403,6 +419,7 @@ async def batch_invoke(
rate_limit_config: Dict,
user_id: str,
project_id: str,
scenarios: Optional[List[Dict]] = None,
**kwargs,
) -> List[InvokationResult]:
"""
Expand Down Expand Up @@ -497,6 +514,7 @@ async def batch_invoke(
openapi_parameters,
user_id,
project_id,
scenarios[index].get("id") if scenarios else None,
**kwargs,
)
)
Expand Down
3 changes: 2 additions & 1 deletion api/ee/src/tasks/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@
TracingQuery,
)
from oss.src.core.workflows.dtos import (
WorkflowServiceData,
WorkflowServiceRequestData,
WorkflowServiceResponseData,
WorkflowServiceRequest,
WorkflowServiceResponse,
WorkflowServiceInterface,
Expand Down
Loading
Loading