
Commit eab6ed1

Author: MerlinKallenbornAA
feat: Upload benchmark lineages from SDK to studio (#1164)
* feat: Add skeleton for Benchmark Lineage Upload
* feat: Adding tests
* refactor: Type Handling of connector function
* feat: Include batch upload of benchmark lineages
* docs: Add Changelog entry
* feat: Add get_benchmark_lineage method for better testing

TASK: PHS-885
1 parent 170acbe commit eab6ed1

File tree: 5 files changed (+370 lines, -52 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 - Introduce `BenchmarkRepository`and `StudioBenchmarkRepository`
 - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
 - Add progressbar to the `Runner` to be able to track the `Run`
+- Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`

 ### Fixes
...

src/intelligence_layer/connectors/studio/studio.py

Lines changed: 130 additions & 3 deletions
@@ -1,3 +1,4 @@
+import gzip
 import json
 import os
 from collections import defaultdict, deque

@@ -8,7 +9,7 @@
 from uuid import uuid4

 import requests
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, RootModel
 from requests.exceptions import ConnectionError, MissingSchema

 from intelligence_layer.connectors import JsonSerializable

@@ -24,6 +25,8 @@
 Input = TypeVar("Input", bound=PydanticSerializable)
 ExpectedOutput = TypeVar("ExpectedOutput", bound=PydanticSerializable)
+Output = TypeVar("Output", bound=PydanticSerializable)
+Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True)


 class StudioProject(BaseModel):

@@ -140,6 +143,38 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]):
     items: Sequence[StudioExample[Input, ExpectedOutput]]


+class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]):
+    trace_id: str
+    input: Input
+    expected_output: ExpectedOutput
+    output: Output
+    example_metadata: Optional[dict[str, Any]] = None
+    evaluation: Any
+    run_latency: int
+    run_tokens: int
+
+
+class PostBenchmarkLineagesRequest(RootModel[Sequence[BenchmarkLineage]]):
+    pass
+
+
+class PostBenchmarkLineagesResponse(RootModel[Sequence[str]]):
+    pass
+
+
+class GetBenchmarkLineageResponse(BaseModel):
+    id: str
+    trace_id: str
+    benchmark_execution_id: str
+    input: Any
+    expected_output: Any
+    example_metadata: Optional[dict[str, Any]] = None
+    output: Any
+    evaluation: Any
+    run_latency: int
+    run_tokens: int
+
+
 class StudioClient:
     """Client for communicating with Studio.


@@ -403,7 +438,7 @@ def get_dataset_examples(
             if page is None:
                 break

-    def create_benchmark(
+    def submit_benchmark(
         self,
         dataset_id: str,
         eval_logic: EvaluationLogicIdentifier,

@@ -449,7 +484,7 @@ def get_benchmark(
             return None
         return GetBenchmarkResponse.model_validate(response_text)

-    def create_benchmark_execution(
+    def submit_benchmark_execution(
         self, benchmark_id: str, data: PostBenchmarkExecution
     ) -> str:
         url = urljoin(

@@ -464,6 +499,98 @@ def create_benchmark_execution(
         self._raise_for_status(response)
         return str(response.json())

+    def submit_benchmark_lineages(
+        self,
+        benchmark_lineages: Sequence[BenchmarkLineage],
+        benchmark_id: str,
+        execution_id: str,
+        max_payload_size: int = 50
+        * 1024
+        * 1024,  # Maximum request size handled by Studio
+    ) -> PostBenchmarkLineagesResponse:
+        """Submit benchmark lineages in batches to avoid exceeding the maximum payload size.
+
+        Args:
+            benchmark_lineages: List of :class:`BenchmarkLineage` to submit.
+            benchmark_id: ID of the benchmark.
+            execution_id: ID of the execution.
+            max_payload_size: Maximum size of the payload in bytes. Defaults to 50MB.
+
+        Returns:
+            Response containing the results of the submissions.
+        """
+        all_responses = []
+        remaining_lineages = list(benchmark_lineages)
+        lineage_sizes = [
+            len(lineage.model_dump_json().encode("utf-8"))
+            for lineage in benchmark_lineages
+        ]
+
+        while remaining_lineages:
+            batch = []
+            current_size = 0
+            # Build batch while checking size
+            for lineage, size in zip(remaining_lineages, lineage_sizes, strict=True):
+                if current_size + size <= max_payload_size:
+                    batch.append(lineage)
+                    current_size += size
+                else:
+                    break
+
+            if batch:
+                # Send batch
+                response = self._send_compressed_batch(
+                    batch, benchmark_id, execution_id
+                )
+                all_responses.extend(response)
+
+            else:  # Only reached if a lineage is too big for the request
+                print("Lineage exceeds maximum of upload size", lineage)
+                batch.append(lineage)
+            remaining_lineages = remaining_lineages[len(batch) :]
+            lineage_sizes = lineage_sizes[len(batch) :]
+
+        return PostBenchmarkLineagesResponse(all_responses)
+
+    def get_benchmark_lineage(
+        self, benchmark_id: str, execution_id: str, lineage_id: str
+    ) -> GetBenchmarkLineageResponse | None:
+        url = urljoin(
+            self.url,
+            f"/api/projects/{self.project_id}/evaluation/benchmarks/{benchmark_id}/executions/{execution_id}/lineages/{lineage_id}",
+        )
+        response = requests.get(
+            url,
+            headers=self._headers,
+        )
+        self._raise_for_status(response)
+        response_text = response.json()
+        if response_text is None:
+            return None
+        return GetBenchmarkLineageResponse.model_validate(response_text)
+
+    def _send_compressed_batch(
+        self, batch: list[BenchmarkLineage], benchmark_id: str, execution_id: str
+    ) -> list[str]:
+        url = urljoin(
+            self.url,
+            f"/api/projects/{self.project_id}/evaluation/benchmarks/{benchmark_id}/executions/{execution_id}/lineages",
+        )
+
+        json_data = PostBenchmarkLineagesRequest(root=batch).model_dump_json()
+        compressed_data = gzip.compress(json_data.encode("utf-8"))
+
+        headers = {**self._headers, "Content-Encoding": "gzip"}
+
+        response = requests.post(
+            url,
+            headers=headers,
+            data=compressed_data,
+        )
+
+        self._raise_for_status(response)
+        return response.json()
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()
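
For context, here is a minimal usage sketch of the new upload path. It is not part of the commit: the project name, constructor arguments, IDs, and lineage values are illustrative assumptions; only `StudioClient.submit_benchmark_lineages` and the `BenchmarkLineage` fields are taken from the diff above.

# Hypothetical usage sketch, assuming a reachable Studio instance and an auth
# token available to the client; all IDs and values below are placeholders.
from intelligence_layer.connectors.studio.studio import BenchmarkLineage, StudioClient

client = StudioClient(project="my-project", create_project=True)

lineages = [
    BenchmarkLineage(
        trace_id="trace-123",  # e.g. the ID returned by an earlier submit_trace call
        input="What is the capital of France?",
        expected_output="Paris",
        output="Paris",
        example_metadata={"split": "test"},
        evaluation={"correct": True},
        run_latency=0,
        run_tokens=0,
    )
]

# Lineages are JSON-serialized, grouped into batches below max_payload_size,
# gzip-compressed, and POSTed to the benchmark execution's lineages endpoint.
lineage_ids = client.submit_benchmark_lineages(
    benchmark_lineages=lineages,
    benchmark_id="benchmark-id",
    execution_id="execution-id",
)
print(list(lineage_ids.root))  # PostBenchmarkLineagesResponse is a pydantic RootModel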

src/intelligence_layer/evaluation/benchmark/studio_benchmark.py

Lines changed: 52 additions & 4 deletions
@@ -1,4 +1,5 @@
 import inspect
+from collections.abc import Sequence
 from datetime import datetime
 from http import HTTPStatus
 from typing import Any, Optional

@@ -9,6 +10,7 @@
 from intelligence_layer.connectors.studio.studio import (
     AggregationLogicIdentifier,
+    BenchmarkLineage,
     EvaluationLogicIdentifier,
     PostBenchmarkExecution,
     StudioClient,

@@ -39,6 +41,9 @@
 from intelligence_layer.evaluation.evaluation.in_memory_evaluation_repository import (
     InMemoryEvaluationRepository,
 )
+from intelligence_layer.evaluation.infrastructure.repository_navigator import (
+    EvaluationLineage,
+)
 from intelligence_layer.evaluation.run.in_memory_run_repository import (
     InMemoryRunRepository,
 )

@@ -135,17 +140,60 @@ def execute(
             statistics=aggregation_overview.statistics.model_dump_json(),
         )

-        benchmark_execution_id = self.client.create_benchmark_execution(
+        benchmark_execution_id = self.client.submit_benchmark_execution(
             benchmark_id=self.id, data=data
         )

-        evaluation_lineages = self.evaluator.evaluation_lineages(evaluation_overview.id)
+        evaluation_lineages = list(
+            self.evaluator.evaluation_lineages(evaluation_overview.id)
+        )
+        trace_ids = []
         for lineage in tqdm(evaluation_lineages, desc="Submitting traces to Studio"):
             trace = lineage.tracers[0]
             assert trace
-            self.client.submit_trace(trace.export_for_viewing())
+            trace_id = self.client.submit_trace(trace.export_for_viewing())
+            trace_ids.append(trace_id)
+
+        benchmark_lineages = self._create_benchmark_lineages(
+            eval_lineages=evaluation_lineages,
+            trace_ids=trace_ids,
+        )
+        self.client.submit_benchmark_lineages(
+            benchmark_lineages=benchmark_lineages,
+            execution_id=benchmark_execution_id,
+            benchmark_id=self.id,
+        )
+
         return benchmark_execution_id

+    def _create_benchmark_lineages(
+        self,
+        eval_lineages: list[
+            EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]
+        ],
+        trace_ids: list[str],
+    ) -> Sequence[BenchmarkLineage[Input, Output, ExpectedOutput, Evaluation]]:
+        return [
+            self._create_benchmark_lineage(eval_lineage, trace_id)
+            for eval_lineage, trace_id in zip(eval_lineages, trace_ids, strict=True)
+        ]
+
+    def _create_benchmark_lineage(
+        self,
+        eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation],
+        trace_id: str,
+    ) -> BenchmarkLineage:
+        return BenchmarkLineage(
+            trace_id=trace_id,
+            input=eval_lineage.example.input,
+            expected_output=eval_lineage.example.expected_output,
+            example_metadata=eval_lineage.example.metadata,
+            output=eval_lineage.outputs[0].output,
+            evaluation=eval_lineage.evaluation.result,
+            run_latency=0,  # TODO: Implement this
+            run_tokens=0,  # TODO: Implement this
+        )
+

 class StudioBenchmarkRepository(BenchmarkRepository):
     def __init__(self, studio_client: StudioClient):

@@ -161,7 +209,7 @@ def create_benchmark(
         description: Optional[str] = None,
     ) -> StudioBenchmark:
         try:
-            benchmark_id = self.client.create_benchmark(
+            benchmark_id = self.client.submit_benchmark(
                 dataset_id,
                 create_evaluation_logic_identifier(eval_logic),
                 create_aggregation_logic_identifier(aggregation_logic),
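
To make the new mapping step concrete, here is a small self-contained sketch, not part of the commit, of what `_create_benchmark_lineages` does: each evaluation lineage is paired with the trace ID returned for it by `submit_trace`, using a strict `zip` so a length mismatch fails loudly. The example data below is made up; only the `BenchmarkLineage` model is taken from the diff.

# Illustrative sketch with placeholder data standing in for EvaluationLineage objects.
from intelligence_layer.connectors.studio.studio import BenchmarkLineage

examples = [
    {"input": "2 + 2", "expected": "4", "output": "4", "evaluation": {"correct": True}},
    {"input": "3 * 3", "expected": "9", "output": "9", "evaluation": {"correct": True}},
]
trace_ids = ["trace-1", "trace-2"]  # one ID per example, as returned by submit_trace

benchmark_lineages = [
    BenchmarkLineage(
        trace_id=trace_id,
        input=example["input"],
        expected_output=example["expected"],
        output=example["output"],
        evaluation=example["evaluation"],
        run_latency=0,  # still hard-coded in this commit (see TODO above)
        run_tokens=0,
    )
    for example, trace_id in zip(examples, trace_ids, strict=True)
]
assert len(benchmark_lineages) == len(trace_ids)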
