mlcommons
diff --git a/‎examples/01_LocalBenchmark/run_tinyllm.py‎
Lines changed: 8 additions & 3 deletions b/‎examples/01_LocalBenchmark/run_tinyllm.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎src/inference_endpoint/async_utils/transport/protocol.py‎
Lines changed: 6 additions & 8 deletions b/‎src/inference_endpoint/async_utils/transport/protocol.py‎
Lines changed: 6 additions & 8 deletions
diff --git a/‎src/inference_endpoint/async_utils/transport/zmq/pubsub.py‎
Lines changed: 1 addition & 1 deletion b/‎src/inference_endpoint/async_utils/transport/zmq/pubsub.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎…endpoint/async_utils/transport/record.py‎ ‎src/inference_endpoint/core/record.py‎src/inference_endpoint/async_utils/transport/record.py renamed to src/inference_endpoint/core/record.py
Lines changed: 5 additions & 2 deletions b/‎…endpoint/async_utils/transport/record.py‎ ‎src/inference_endpoint/core/record.py‎src/inference_endpoint/async_utils/transport/record.py renamed to src/inference_endpoint/core/record.py
Lines changed: 5 additions & 2 deletions
diff --git a/‎src/inference_endpoint/core/types.py‎
Lines changed: 115 additions & 25 deletions b/‎src/inference_endpoint/core/types.py‎
Lines changed: 115 additions & 25 deletions
diff --git a/‎src/inference_endpoint/endpoint_client/config.py‎
Lines changed: 1 addition & 1 deletion b/‎src/inference_endpoint/endpoint_client/config.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/inference_endpoint/endpoint_client/http_client.py‎
Lines changed: 6 additions & 1 deletion b/‎src/inference_endpoint/endpoint_client/http_client.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/inference_endpoint/endpoint_client/worker.py‎
Lines changed: 9 additions & 3 deletions b/‎src/inference_endpoint/endpoint_client/worker.py‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎src/inference_endpoint/load_generator/sample.py‎
Lines changed: 7 additions & 2 deletions b/‎src/inference_endpoint/load_generator/sample.py‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎src/inference_endpoint/metrics/recorder.py‎
Lines changed: 3 additions & 0 deletions b/‎src/inference_endpoint/metrics/recorder.py‎
Lines changed: 3 additions & 0 deletions
@@ -21,7 +21,7 @@
 import inference_endpoint.config.rulesets.mlcommons.models as mlcommons_models
 from inference_endpoint.config.rulesets.mlcommons.rules import CURRENT
 from inference_endpoint.config.user_config import UserConfig
-from inference_endpoint.core.types import QueryResult, StreamChunk
+from inference_endpoint.core.types import QueryResult, StreamChunk, TextModelOutput
 from inference_endpoint.dataset_manager.dataset import Dataset
 from inference_endpoint.load_generator import (
     BenchmarkSession,
@@ -167,10 +167,15 @@ def issue(self, sample):
                 )
                 SampleEventHandler.stream_chunk_complete(stream_chunk)
                 first = False
-            query_result = QueryResult(id=sample.uuid, response_output=chunks)
+            query_result = QueryResult(
+                id=sample.uuid,
+                response_output=TextModelOutput(output=chunks, reasoning=None),
+            )
         else:
             response = self.compute_func(sample.data)
-            query_result = QueryResult(id=sample.uuid, response_output=response)
+            query_result = QueryResult(
+                id=sample.uuid, response_output=TextModelOutput(output=response)
+            )
         SampleEventHandler.query_result_complete(query_result)
 
 
 
@@ -29,13 +29,13 @@
 
 import msgspec
 
-from inference_endpoint.async_utils.transport.record import (
+from inference_endpoint.core.record import (
     ErrorEventType,
     EventRecord,
     decode_event_record,
     encode_event_record,
 )
-from inference_endpoint.core.types import Query, QueryResult, StreamChunk
+from inference_endpoint.core.types import ErrorData, Query, QueryResult, StreamChunk
 
 if TYPE_CHECKING:
     from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
@@ -341,14 +341,12 @@ def _on_readable(self) -> None:
                 try:
                     event_record = decode_event_record(payload)
                 except msgspec.DecodeError as e:
-                    # Record an error instead
-                    # TODO: Make `data` field more rigidly typed
                     event_record = EventRecord(
                         event_type=ErrorEventType.GENERIC,
-                        data={
-                            "error_type": "msgspec.DecodeError",
-                            "error_message": str(e),
-                        },
+                        data=ErrorData(
+                            error_type="msgspec.DecodeError",
+                            error_message=str(e),
+                        ),
                     )
                 records.append(event_record)
         except StopIteration:
 
@@ -24,7 +24,7 @@
     EventRecordPublisher,
     EventRecordSubscriber,
 )
-from inference_endpoint.async_utils.transport.record import TOPIC_FRAME_SIZE
+from inference_endpoint.core.record import TOPIC_FRAME_SIZE
 
 from .context import ManagedZMQContext
 
 
@@ -19,6 +19,8 @@
 
 import msgspec
 
+from .types import OUTPUT_TYPE, ErrorData, PromptData
+
 TOPIC_FRAME_SIZE: Final[int] = 40
 """int: Fixed bytesize for the encoded topic string. PUB messages will be prefixed by a
 topic string corresponding to the EventType. This topic will be null-padded to this fixed
@@ -120,6 +122,7 @@ class SessionEventType(EventType):
     STARTED = "started"
     ENDED = "ended"
     STOP_LOADGEN = "stop_loadgen"
+    START_PERFORMANCE_TRACKING = "start_performance_tracking"
     STOP_PERFORMANCE_TRACKING = "stop_performance_tracking"
 
 
@@ -145,13 +148,13 @@ class SampleEventType(EventType):
     TRANSPORT_RECV = "transport_recv"
 
 
-class EventRecord(msgspec.Struct, kw_only=True):  # type: ignore[call-arg]
+class EventRecord(msgspec.Struct, kw_only=True, frozen=True, gc=False):  # type: ignore[call-arg]
     """A record of an event that occurs throughout the inference process."""
 
     event_type: EventType
     timestamp_ns: int = msgspec.field(default_factory=time.monotonic_ns)
     sample_uuid: str = ""
-    data: dict[str, Any] = msgspec.field(default_factory=dict)
+    data: OUTPUT_TYPE | PromptData | ErrorData | None = None
 
 
 _ENCODER = msgspec.msgpack.Encoder(enc_hook=EventType.encode_hook)
 
@@ -48,8 +48,110 @@ class QueryStatus(Enum):
     CANCELLED = "cancelled"
 
 
-_OUTPUT_DICT_TYPE = dict[str, str | list[str]]
-_OUTPUT_RESULT_TYPE = str | tuple[str, ...] | _OUTPUT_DICT_TYPE | None
+OUTPUT_ELEM_TYPE = str | tuple[str, ...]
+"""Type for a single output or reasoning value: string (non-streaming) or tuple of strings (streaming)."""
+
+
+class TextModelOutput(
+    msgspec.Struct,
+    tag=True,
+    kw_only=True,
+    frozen=True,
+    omit_defaults=True,
+    array_like=True,
+    gc=False,
+):  # type: ignore[call-arg]
+    """Structured output from a text model.
+
+    Supports main output and optional reasoning (e.g. chain-of-thought).
+    Each field may be a string (non-streaming) or tuple of strings (streaming chunks).
+
+    Attributes:
+        output: Main model output. Defaults to empty string.
+        reasoning: Optional reasoning trace. Defaults to None.
+    """
+
+    output: OUTPUT_ELEM_TYPE = ""
+    reasoning: OUTPUT_ELEM_TYPE | None = None
+
+    def __post_init__(self):
+        """Convert list to tuple for output and reasoning to preserve immutability."""
+        if isinstance(self.output, list):
+            msgspec.structs.force_setattr(self, "output", tuple(self.output))
+        if self.reasoning is not None and isinstance(self.reasoning, list):
+            msgspec.structs.force_setattr(self, "reasoning", tuple(self.reasoning))
+
+    def __str__(self) -> str:
+        """Return reasoning (if any) followed by output as one string (joins tuple chunks if streaming)."""
+        parts = []
+        if self.reasoning:
+            if isinstance(self.reasoning, str):
+                parts.append(self.reasoning)
+            elif isinstance(self.reasoning, tuple):
+                parts.extend(self.reasoning)
+
+        if self.output:
+            if isinstance(self.output, str):
+                parts.append(self.output)
+            elif isinstance(self.output, tuple):
+                parts.extend(self.output)
+
+        return "".join(parts)
+
+
+OUTPUT_TYPE = TextModelOutput
+
+
+class PromptData(
+    msgspec.Struct,
+    tag=True,
+    kw_only=True,
+    frozen=True,
+    omit_defaults=True,
+    array_like=True,
+    gc=False,
+):  # type: ignore[call-arg]
+    """Prompt input data attached to ISSUED events for ISL computation.
+
+    Exactly one of ``text`` or ``token_ids`` should be set:
+    - ``text``: raw prompt string (OpenAI path) — requires tokenization for ISL.
+    - ``token_ids``: pre-tokenized token ID list (SGLang/Harmonize path) — ISL is len(token_ids).
+
+    Attributes:
+        text: Raw prompt string. Set when the adapter sends text prompts.
+        token_ids: Pre-computed token IDs. Set when the adapter pre-tokenizes (e.g. SGLang).
+    """
+
+    text: str | None = None
+    token_ids: tuple[int, ...] | None = None
+
+
+class ErrorData(
+    msgspec.Struct,
+    tag=True,
+    kw_only=True,
+    frozen=True,
+    omit_defaults=True,
+    array_like=True,
+    gc=False,
+):  # type: ignore[call-arg]
+    """Structured error information.
+
+    Attributes:
+        error_type: Name of error. If possible, should be a qualified error type (e.g. "msgspec.DecodeError").
+        error_message: Optional human-readable message. Defaults to empty string.
+    """
+
+    error_type: str
+    error_message: str = ""
+
+    def __str__(self) -> str:
+        """Human-readable string: 'type: message' if message present, else 'type'."""
+        return (
+            f"{self.error_type}: {self.error_message}"
+            if self.error_message
+            else self.error_type
+        )
 
 
 class Query(
@@ -98,6 +200,7 @@ class Query(
     created_at: float = msgspec.field(default_factory=time.time)
 
 
+# gc=False: audit 2026-03: metadata dict is only ever read, never mutated after construction.
 class QueryResult(
     msgspec.Struct,
     tag="query_result",
@@ -109,6 +212,10 @@ class QueryResult(
 ):  # type: ignore[call-arg]
     """Result of a completed inference query.
 
+    AT-RISK (gc=False): Has mutable container field `metadata`. Audit any change
+    that mutates `metadata` after construction or stores this struct in a container
+    referenced by this struct; if either can happen, remove gc=False.
+
     Represents the outcome of processing a Query, including the response text,
     metadata, and any error information. The completed_at timestamp is
     automatically set to ensure accurate timing measurements.
@@ -118,14 +225,10 @@ class QueryResult(
 
     Attributes:
         id: Query identifier (matches the originating Query.id).
-        response_output: Generated text response from the endpoint (None if error).
-                         Can be a string, or a tuple of strings. If it is a string,
-                         it is assumed to be a non-streaming response. If it is a
-                         tuple of strings, it is assumed to be a streamed response,
-                         where the first element is the first chunk, which will not
-                         be included in the TPOT measurements.
+        response_output: Generated response from the endpoint (None if error).
+                         Prefer TextModelOutput; str is supported but will be deprecated.
         metadata: Additional response metadata (token counts, model info, etc.).
-        error: Error message if query failed (None if successful).
+        error: Structured error if query failed (None if successful).
         completed_at: High-resolution timestamp (nanoseconds, monotonic clock).
                       Auto-set in __post_init__ to prevent tampering.
 
@@ -144,9 +247,9 @@ class QueryResult(
     """
 
     id: str = ""
-    response_output: _OUTPUT_RESULT_TYPE = None
+    response_output: OUTPUT_TYPE | None = None
     metadata: dict[str, Any] = msgspec.field(default_factory=dict)
-    error: str | None = None
+    error: ErrorData | None = None
     completed_at: int | msgspec.UnsetType = msgspec.UNSET
 
     def __post_init__(self):
@@ -166,22 +269,9 @@ def __post_init__(self):
         # due to how monotonic_ns works.
         msgspec.structs.force_setattr(self, "completed_at", time.monotonic_ns())
 
-        # A list can be passed on, but we need to convert it to a tuple to maintain immutability,
-        # and for serialization to work properly.
-        if isinstance(self.response_output, list):
-            msgspec.structs.force_setattr(
-                self, "response_output", tuple(self.response_output)
-            )
-        elif isinstance(self.response_output, dict):
-            for k, v in self.response_output.items():
-                if isinstance(v, list):
-                    self.response_output[k] = tuple(v)
-
     def get_response_output_string(self) -> str:
         """Get the response output as a string."""
-        if isinstance(self.response_output, tuple):
-            return "".join(self.response_output)
-        elif isinstance(self.response_output, dict):
+        if isinstance(self.response_output, TextModelOutput):
             return str(self.response_output)
         elif isinstance(self.response_output, str):
             return self.response_output
 
@@ -71,7 +71,7 @@ class HTTPClientConfig:
     cpu_affinity: AffinityPlan | None = None
 
     # Worker lifecycle timeouts
-    worker_initialization_timeout: float = 40.0  # init
+    worker_initialization_timeout: float = 60.0  # init
     worker_graceful_shutdown_wait: float = 0.5  # post-run
     worker_force_kill_timeout: float = 0.5  # post-run
 
 
@@ -115,12 +115,17 @@ def issue(self, query: Query) -> None:
         """
         Issue query to endpoint (round-robin to workers).
         Non-blocking - buffers if socket would block.
+
+        Thread-safe: schedules the send on the event loop thread via
+        call_soon_threadsafe, since the underlying ZMQ sockets and send
+        buffers are not thread-safe and belong to the event loop thread.
         """
         if self._shutdown:
             # NOTE(vir): drop requests during shutdown
             self._dropped_requests += 1
         else:
-            self.pool.send(next(self._worker_cycle), query)
+            worker_id = next(self._worker_cycle)
+            self.loop.call_soon_threadsafe(self.pool.send, worker_id, query)
 
     def poll(self) -> QueryResult | StreamChunk | None:
         """Non-blocking. Returns response if available, None otherwise."""
 
@@ -35,7 +35,7 @@
     WorkerConnector,
 )
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
-from inference_endpoint.core.types import Query, QueryResult
+from inference_endpoint.core.types import ErrorData, Query, QueryResult
 from inference_endpoint.endpoint_client.adapter_protocol import HttpRequestAdapter
 from inference_endpoint.endpoint_client.config import HTTPClientConfig
 from inference_endpoint.endpoint_client.http import (
@@ -515,11 +515,17 @@ async def _handle_error(self, query_id: str, error: Exception | str) -> None:
         if self._shutdown or not self._responses:
             return
 
-        error_message = repr(error) if isinstance(error, Exception) else error
+        if isinstance(error, Exception):
+            error_data = ErrorData(
+                error_type=type(error).__name__,
+                error_message=repr(error),
+            )
+        else:
+            error_data = ErrorData(error_type="error", error_message=error)
         error_response = QueryResult(
             id=query_id,
             response_output=None,
-            error=error_message,
+            error=error_data,
         )
         self._responses.send(error_response)
         if self.http_config.record_worker_events:
 
@@ -85,6 +85,10 @@ class _SampleEventHandler:
     A valid hook is a callable that takes a single argument, representing the response object (StreamChunk or QueryResult).
 
     A simple example use-case of a hook is to update a progress bar on-completion of a sample.
+
+    NOTE: Hook lists are not thread-safe. Hooks must be registered before the benchmark
+    starts (single-threaded setup phase). This is a known limitation; _SampleEventHandler
+    is being deprecated in favor of the pub-sub EventLoggerService.
     """
 
     __slots__ = ["first_chunk_hooks", "non_first_chunk_hooks", "complete_hooks"]
@@ -180,9 +184,10 @@ def query_result_complete(self, result: QueryResult) -> None:
 
         # Even if there is an error, we still record the event to count the sample as complete
         if result.error is not None:
-            logger.error(f"Error in request {result.id}: {result.error}")
+            err_str = str(result.error)
+            logger.error(f"Error in request {result.id}: {err_str}")
 
-            record_exception(result.error, result.id)
+            record_exception(err_str, result.id)
 
         EventRecorder.record_event(
             SampleEvent.COMPLETE,
 
@@ -372,6 +372,9 @@ def record_event(
             )
 
         # Update inflight sample tracking
+        # NOTE: n_inflight_samples is not thread-safe (+=/-= from multiple threads).
+        # This is a known issue but EventRecorder is being deprecated in favor of
+        # EventLoggerService (pub-sub based). Not worth fixing here.
         if ev_type == SessionEvent.LOADGEN_ISSUE_CALLED:
             rec_inst.n_inflight_samples += 1
         elif ev_type == SampleEvent.COMPLETE:
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@`
`24`	`24`	`EventRecordPublisher,`
`25`	`25`	`EventRecordSubscriber,`
`26`	`26`	`)`
`27`		`-from inference_endpoint.async_utils.transport.record import TOPIC_FRAME_SIZE`
	`27`	`+from inference_endpoint.core.record import TOPIC_FRAME_SIZE`
`28`	`28`
`29`	`29`	`from .context import ManagedZMQContext`
`30`	`30`
Original file line number	Diff line number	Diff line change
`@@ -372,6 +372,9 @@ def record_event(`
`372`	`372`	`)`
`373`	`373`
`374`	`374`	`# Update inflight sample tracking`
	`375`	`+ # NOTE: n_inflight_samples is not thread-safe (+=/-= from multiple threads).`
	`376`	`+ # This is a known issue but EventRecorder is being deprecated in favor of`
	`377`	`+ # EventLoggerService (pub-sub based). Not worth fixing here.`
`375`	`378`	`if ev_type == SessionEvent.LOADGEN_ISSUE_CALLED:`
`376`	`379`	`rec_inst.n_inflight_samples += 1`
`377`	`380`	`elif ev_type == SampleEvent.COMPLETE:`