Skip to content

Commit 4d4139b

Browse files
committed
optimize msgspec implementation
1 parent ef89455 commit 4d4139b

File tree

8 files changed

+1113
-14
lines changed

8 files changed

+1113
-14
lines changed

.cursor/rules/msgspec-patterns.mdc

Lines changed: 534 additions & 0 deletions
Large diffs are not rendered by default.

src/inference_endpoint/core/types.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,13 @@ class QueryStatus(Enum):
5252
_OUTPUT_RESULT_TYPE = str | tuple[str, ...] | _OUTPUT_DICT_TYPE | None
5353

5454

55-
class Query(msgspec.Struct, kw_only=True):
55+
class Query(
56+
msgspec.Struct,
57+
kw_only=True,
58+
array_like=True,
59+
omit_defaults=True,
60+
gc=False
61+
):
5662
"""Represents a single inference query to be sent to an endpoint.
5763
5864
A Query encapsulates all information needed to make an HTTP request to
@@ -72,6 +78,17 @@ class Query(msgspec.Struct, kw_only=True):
7278
... data={"prompt": "Hello", "model": "Qwen/Qwen3-8B", "max_tokens": 100},
7379
... headers={"Authorization": "Bearer token123"},
7480
... )
81+
82+
Note:
83+
gc=False: Safe because data/headers are simple key-value pairs without cycles.
84+
Do NOT store self-referential or cyclic structures in data/headers fields.
85+
86+
array_like=True: Encodes as array instead of object (e.g., ["id", {...}, {...}, 0.0]
87+
instead of {"id": ..., "data": ..., ...}). Provides ~6-50% size reduction and
88+
~6-29% ser/des speedup for ZMQ transport depending on payload size.
89+
90+
omit_defaults=True: Fields with default values are omitted during encoding,
91+
further reducing message size for queries with empty headers.
7592
"""
7693

7794
id: str = msgspec.field(default_factory=lambda: str(uuid.uuid4()))
@@ -80,7 +97,15 @@ class Query(msgspec.Struct, kw_only=True):
8097
created_at: float = msgspec.field(default_factory=time.time)
8198

8299

83-
class QueryResult(msgspec.Struct, tag="query_result", kw_only=True, frozen=True):
100+
class QueryResult(
101+
msgspec.Struct,
102+
tag="query_result",
103+
kw_only=True,
104+
frozen=True,
105+
array_like=True,
106+
omit_defaults=True,
107+
gc=False,
108+
):
84109
"""Result of a completed inference query.
85110
86111
Represents the outcome of processing a Query, including the response text,
@@ -106,6 +131,15 @@ class QueryResult(msgspec.Struct, tag="query_result", kw_only=True, frozen=True)
106131
Note:
107132
The completed_at field is intentionally set internally to prevent
108133
benchmark result manipulation. Users must not override this timestamp.
134+
135+
gc=False: Safe because metadata contains only scalar key-value pairs.
136+
Do NOT store cyclic references in metadata or response_output fields.
137+
138+
omit_defaults=True: Fields with static defaults (i.e., those NOT using default_factory)
139+
are omitted if value equals default.
140+
141+
array_like=True: Encodes as array instead of object (e.g. ["id", "chunk", false, {}]
142+
instead of {"id": ..., "response_chunk": ..., ...}). Reduces payload size.
109143
"""
110144

111145
id: str = ""
@@ -143,7 +177,14 @@ def __post_init__(self):
143177
self.response_output[k] = tuple(v)
144178

145179

146-
class StreamChunk(msgspec.Struct, tag="stream_chunk", kw_only=True):
180+
class StreamChunk(
181+
msgspec.Struct,
182+
tag="stream_chunk",
183+
kw_only=True,
184+
array_like=True,
185+
omit_defaults=True,
186+
gc=False,
187+
):
147188
"""A single chunk from a streaming inference response.
148189
149190
Streaming responses are sent incrementally as the model generates text.
@@ -163,6 +204,16 @@ class StreamChunk(msgspec.Struct, tag="stream_chunk", kw_only=True):
163204
Streaming "Hello World" might produce:
164205
>>> StreamChunk(id="q1", response_chunk="Hello", is_complete=False)
165206
>>> StreamChunk(id="q1", response_chunk=" World", is_complete=True)
207+
208+
Note:
209+
gc=False: Safe because metadata contains only scalar key-value pairs.
210+
Do NOT store cyclic references in metadata field.
211+
212+
omit_defaults=True: Fields with static defaults (i.e., those NOT using default_factory)
213+
are omitted if value equals default.
214+
215+
array_like=True: Encodes as array instead of object (e.g. ["id", "chunk", false, {}]
216+
instead of {"id": ..., "response_chunk": ..., ...}). Reduces payload size.
166217
"""
167218

168219
id: str = ""

src/inference_endpoint/openai/types.py

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,40 +24,67 @@
2424
# ============================================================================
2525

2626

27-
class SSEDelta(msgspec.Struct):
27+
# NOTE(vir): msgspec usage
28+
# omit_defaults=True: Fields with static defaults are omitted if value equals default (i.e., those not using default_factory)
29+
# gc=False: Safe for request/response structs with scalar and nested struct fields only.
30+
31+
32+
class SSEDelta(
33+
msgspec.Struct,
34+
omit_defaults=True,
35+
gc=False
36+
):
2837
"""SSE delta object containing content."""
2938

3039
content: str = ""
3140
reasoning: str = ""
3241

3342

34-
class SSEChoice(msgspec.Struct):
43+
class SSEChoice(
44+
msgspec.Struct,
45+
omit_defaults=True,
46+
gc=False
47+
):
3548
"""SSE choice object containing delta."""
3649

3750
delta: SSEDelta = msgspec.field(default_factory=SSEDelta)
3851
finish_reason: str | None = None
3952

4053

41-
class SSEMessage(msgspec.Struct):
54+
class SSEMessage(
55+
msgspec.Struct,
56+
omit_defaults=True,
57+
gc=False
58+
):
4259
"""SSE message structure for OpenAI streaming responses."""
4360

4461
choices: list[SSEChoice] = msgspec.field(default_factory=list)
4562

4663

4764
# ============================================================================
48-
# OpenAI Chat Completion Types (msgspec-based)
65+
# OpenAI Chat Completion Types
4966
# ============================================================================
5067

5168

52-
class ChatMessage(msgspec.Struct, kw_only=True, omit_defaults=True):
69+
class ChatMessage(
70+
msgspec.Struct,
71+
kw_only=True,
72+
omit_defaults=True,
73+
gc=False
74+
):
5375
"""Chat message in OpenAI format."""
5476

5577
role: str
5678
content: str
5779
name: str | None = None
5880

5981

60-
class ChatCompletionRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
82+
class ChatCompletionRequest(
83+
msgspec.Struct,
84+
kw_only=True,
85+
omit_defaults=True,
86+
gc=False
87+
):
6188
"""OpenAI chat completion request."""
6289

6390
model: str
@@ -76,32 +103,52 @@ class ChatCompletionRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
76103
user: str | None = None
77104

78105

79-
class ChatCompletionResponseMessage(msgspec.Struct, kw_only=True, omit_defaults=True):
106+
class ChatCompletionResponseMessage(
107+
msgspec.Struct,
108+
kw_only=True,
109+
omit_defaults=True,
110+
gc=False
111+
):
80112
"""Response message from OpenAI."""
81113

82114
role: str
83115
content: str | None
84116
refusal: str | None
85117

86118

87-
class ChatCompletionChoice(msgspec.Struct, kw_only=True, omit_defaults=True):
119+
class ChatCompletionChoice(
120+
msgspec.Struct,
121+
kw_only=True,
122+
omit_defaults=True,
123+
gc=False
124+
):
88125
"""A single choice in the completion response."""
89126

90127
index: int
91128
message: ChatCompletionResponseMessage
92129
finish_reason: str | None
93130

94131

95-
class CompletionUsage(msgspec.Struct, kw_only=True, omit_defaults=True):
132+
class CompletionUsage(
133+
msgspec.Struct,
134+
kw_only=True,
135+
omit_defaults=True,
136+
gc=False
137+
):
96138
"""Token usage statistics."""
97139

98140
prompt_tokens: int
99141
completion_tokens: int
100142
total_tokens: int
101143

102144

103-
class ChatCompletionResponse(msgspec.Struct, kw_only=True, omit_defaults=True):
104-
"""OpenAI chat completion response (msgspec version)."""
145+
class ChatCompletionResponse(
146+
msgspec.Struct,
147+
kw_only=True,
148+
omit_defaults=True,
149+
gc=False
150+
):
151+
"""OpenAI chat completion response."""
105152

106153
id: str
107154
object: str = "chat.completion"

tests/performance/openai/__init__.py

Whitespace-only changes.
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""
17+
Performance benchmarks for OpenAIAdapter (pydantic + orjson) using pytest-benchmark.
18+
19+
Measures ns/op for encode_query, decode_response, decode_sse_message
20+
with varying payload sizes (0, 100, 1k, 8k, 32k). Run with:
21+
22+
pytest tests/performance/openai/test_adapter.py --benchmark-only --benchmark-columns=mean,stddev,ops
23+
"""
24+
25+
import json
26+
27+
import pytest
28+
29+
from inference_endpoint.core.types import Query
30+
from inference_endpoint.openai.openai_adapter import OpenAIAdapter
31+
32+
# Benchmark payload sizes: label -> payload string of that many 'x' characters.
_SIZE_CHARS = {"empty": 0, "100": 100, "1k": 1_000, "8k": 8_000, "32k": 32_000}
TEXT_SIZES = {label: "x" * chars for label, chars in _SIZE_CHARS.items()}
39+
40+
41+
def make_query(text: str) -> Query:
    """Create a Query for benchmarks."""
    # Fixed id/model so benchmark runs are comparable across payload sizes.
    request_body = {
        "prompt": text,
        "model": "test-model",
        "max_completion_tokens": 100,
    }
    return Query(
        id="test-id",
        data=request_body,
        headers={"Authorization": "Bearer token"},
    )
48+
49+
50+
def make_response_bytes(text: str) -> bytes:
    """Create OpenAI-compatible response JSON bytes."""
    # Single choice carrying the benchmark payload as the assistant message.
    choice = {
        "index": 0,
        "message": {"role": "assistant", "content": text, "refusal": None},
        "finish_reason": "stop",
        "logprobs": None,
    }
    payload = {
        "id": "chatcmpl-test",
        "object": "chat.completion",
        "created": 1234567890,
        "model": "test-model",
        "choices": [choice],
        "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
        "system_fingerprint": "fp_test",
    }
    return json.dumps(payload).encode()
70+
71+
72+
def make_sse_bytes(text: str) -> bytes:
    """Create SSE message JSON bytes."""
    delta = {"content": text, "reasoning": ""}
    message = {"choices": [{"delta": delta, "finish_reason": None}]}
    return json.dumps(message).encode()
81+
82+
83+
@pytest.mark.parametrize("size_name,text", TEXT_SIZES.items(), ids=TEXT_SIZES.keys())
def test_encode_query(benchmark, size_name, text):
    """Benchmark encode_query (Query -> HTTP bytes)."""
    # Group set first so reports bucket all payload sizes together.
    benchmark.group = "openai_adapter_encode_query"
    benchmark(OpenAIAdapter.encode_query, make_query(text))
89+
90+
91+
@pytest.mark.parametrize("size_name,text", TEXT_SIZES.items(), ids=TEXT_SIZES.keys())
def test_decode_response(benchmark, size_name, text):
    """Benchmark decode_response (HTTP bytes -> QueryResult)."""
    benchmark.group = "openai_adapter_decode_response"
    # Build the fixture outside the timed call so only decoding is measured.
    payload = make_response_bytes(text)
    benchmark(OpenAIAdapter.decode_response, payload, "test-id")
97+
98+
99+
@pytest.mark.parametrize("size_name,text", TEXT_SIZES.items(), ids=TEXT_SIZES.keys())
def test_decode_sse(benchmark, size_name, text):
    """Benchmark decode_sse_message (SSE bytes -> content)."""
    benchmark.group = "openai_adapter_decode_sse"
    # Build the fixture outside the timed call so only decoding is measured.
    benchmark(OpenAIAdapter.decode_sse_message, make_sse_bytes(text))

0 commit comments

Comments
 (0)