@@ -1,3 +1,4 @@
+import gzip
 import json
 import os
 from collections import defaultdict, deque
@@ -8,7 +9,7 @@
 from uuid import uuid4

 import requests
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, RootModel
 from requests.exceptions import ConnectionError, MissingSchema

 from intelligence_layer.connectors import JsonSerializable
@@ -24,6 +25,8 @@

 Input = TypeVar("Input", bound=PydanticSerializable)
 ExpectedOutput = TypeVar("ExpectedOutput", bound=PydanticSerializable)
+Output = TypeVar("Output", bound=PydanticSerializable)
+Evaluation = TypeVar("Evaluation", bound=BaseModel, covariant=True)


 class StudioProject(BaseModel):
@@ -140,6 +143,38 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]):
     items: Sequence[StudioExample[Input, ExpectedOutput]]


+class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]):
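+    """Ties one example (input, expected output, metadata) to the output,
+    evaluation, trace id, latency and token usage of a single benchmark run."""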
+    trace_id: str
+    input: Input
+    expected_output: ExpectedOutput
+    output: Output
+    example_metadata: Optional[dict[str, Any]] = None
+    evaluation: Any
+    run_latency: int
+    run_tokens: int
+
+
+class PostBenchmarkLineagesRequest(RootModel[Sequence[BenchmarkLineage]]):
+    pass
+
+
+class PostBenchmarkLineagesResponse(RootModel[Sequence[str]]):
+    pass
+
+
+class GetBenchmarkLineageResponse(BaseModel):
+    id: str
+    trace_id: str
+    benchmark_execution_id: str
+    input: Any
+    expected_output: Any
+    example_metadata: Optional[dict[str, Any]] = None
+    output: Any
+    evaluation: Any
+    run_latency: int
+    run_tokens: int
+
+
 class StudioClient:
     """Client for communicating with Studio.

@@ -403,7 +438,7 @@ def get_dataset_examples(
             if page is None:
                 break

-    def create_benchmark(
+    def submit_benchmark(
         self,
         dataset_id: str,
         eval_logic: EvaluationLogicIdentifier,
@@ -449,7 +484,7 @@ def get_benchmark(
             return None
         return GetBenchmarkResponse.model_validate(response_text)

-    def create_benchmark_execution(
+    def submit_benchmark_execution(
         self, benchmark_id: str, data: PostBenchmarkExecution
     ) -> str:
         url = urljoin(
@@ -464,6 +499,98 @@ def create_benchmark_execution(
         self._raise_for_status(response)
         return str(response.json())

+    def submit_benchmark_lineages(
+        self,
+        benchmark_lineages: Sequence[BenchmarkLineage],
+        benchmark_id: str,
+        execution_id: str,
+        max_payload_size: int = 50
+        * 1024
+        * 1024,  # Maximum request size handled by Studio
+    ) -> PostBenchmarkLineagesResponse:
+        """Submit benchmark lineages in batches to avoid exceeding the maximum payload size.
+
+        Args:
+            benchmark_lineages: List of :class:`BenchmarkLineage` objects to submit.
+            benchmark_id: ID of the benchmark.
+            execution_id: ID of the execution.
+            max_payload_size: Maximum size of the payload in bytes. Defaults to 50MB.
+
+        Returns:
+            Response containing the results of the submissions.
+        """
+        all_responses = []
+        remaining_lineages = list(benchmark_lineages)
+        lineage_sizes = [
+            len(lineage.model_dump_json().encode("utf-8"))
+            for lineage in benchmark_lineages
+        ]
+
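+        # Greedily pack lineages into batches that stay under max_payload_size;
+        # each batch is sent as a single gzip-compressed request.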
+        while remaining_lineages:
+            batch = []
+            current_size = 0
+            # Build batch while checking size
+            for lineage, size in zip(remaining_lineages, lineage_sizes, strict=True):
+                if current_size + size <= max_payload_size:
+                    batch.append(lineage)
+                    current_size += size
+                else:
+                    break
+
+            if batch:
+                # Send batch
+                response = self._send_compressed_batch(
+                    batch, benchmark_id, execution_id
+                )
+                all_responses.extend(response)
+
+            else:  # Only reached if a lineage is too big for the request
+                print("Lineage exceeds maximum upload size", lineage)
+                batch.append(lineage)
+            remaining_lineages = remaining_lineages[len(batch) :]
+            lineage_sizes = lineage_sizes[len(batch) :]
+
+        return PostBenchmarkLineagesResponse(all_responses)
+
+    def get_benchmark_lineage(
+        self, benchmark_id: str, execution_id: str, lineage_id: str
+    ) -> GetBenchmarkLineageResponse | None:
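+        """Fetch a single benchmark lineage by its ID.
+
+        Returns None if the endpoint responds with a null body.
+        """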
+        url = urljoin(
+            self.url,
+            f"/api/projects/{self.project_id}/evaluation/benchmarks/{benchmark_id}/executions/{execution_id}/lineages/{lineage_id}",
+        )
+        response = requests.get(
+            url,
+            headers=self._headers,
+        )
+        self._raise_for_status(response)
+        response_text = response.json()
+        if response_text is None:
+            return None
+        return GetBenchmarkLineageResponse.model_validate(response_text)
+
+    def _send_compressed_batch(
+        self, batch: list[BenchmarkLineage], benchmark_id: str, execution_id: str
+    ) -> list[str]:
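+        """Gzip-compress one batch of lineages and POST it to the lineages endpoint."""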
+        url = urljoin(
+            self.url,
+            f"/api/projects/{self.project_id}/evaluation/benchmarks/{benchmark_id}/executions/{execution_id}/lineages",
+        )
+
+        json_data = PostBenchmarkLineagesRequest(root=batch).model_dump_json()
+        compressed_data = gzip.compress(json_data.encode("utf-8"))
+
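+        # Content-Encoding: gzip marks the request body as gzip-compressed so the
+        # server can decompress it before parsing the JSON payload.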
+        headers = {**self._headers, "Content-Encoding": "gzip"}
+
+        response = requests.post(
+            url,
+            headers=headers,
+            data=compressed_data,
+        )
+
+        self._raise_for_status(response)
+        return response.json()
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()