getter111
diff --git a/‎packages/nvidia_nat_adk/src/nat/plugins/adk/llm.py‎
Lines changed: 4 additions & 4 deletions b/‎packages/nvidia_nat_adk/src/nat/plugins/adk/llm.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎packages/nvidia_nat_adk/tests/test_adk_llm.py‎
Lines changed: 8 additions & 4 deletions b/‎packages/nvidia_nat_adk/tests/test_adk_llm.py‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎packages/nvidia_nat_core/src/nat/llm/dynamo_llm.py‎
Lines changed: 129 additions & 51 deletions b/‎packages/nvidia_nat_core/src/nat/llm/dynamo_llm.py‎
Lines changed: 129 additions & 51 deletions
@@ -179,8 +179,8 @@ async def dynamo_adk(config: DynamoModelConfig, _builder: Builder):
     if config.base_url:
         config_dict["api_base"] = config.base_url
 
-    # Build Dynamo prefix headers if prefix_template is configured
-    if config.prefix_template is not None:
+    # Build Dynamo prefix headers if prefix_template is configured and headers are enabled
+    if config.prefix_template is not None and not config.disable_headers:
         # Generate a static prefix ID for this LLM instance
         # For dynamic prefix IDs, users should use the LangChain client or manage sessions manually
         unique_id = uuid.uuid4().hex[:16]
@@ -189,8 +189,8 @@ async def dynamo_adk(config: DynamoModelConfig, _builder: Builder):
         extra_headers = {
             "x-prefix-id": prefix_id,
             "x-prefix-total-requests": str(config.prefix_total_requests),
-            "x-prefix-osl": config.prefix_osl.upper(),
-            "x-prefix-iat": config.prefix_iat.upper(),
+            "x-prefix-osl": str(config.prefix_osl),
+            "x-prefix-iat": str(config.prefix_iat),
         }
         config_dict["extra_headers"] = extra_headers
 
 
@@ -199,8 +199,9 @@ def dynamo_cfg_with_prefix(self):
             base_url="http://localhost:8000/v1",
             prefix_template="session-{uuid}",
             prefix_total_requests=15,
-            prefix_osl="HIGH",
-            prefix_iat="LOW",
+            prefix_osl=2048,
+            prefix_iat=50,
+            disable_headers=False,
         )
 
     @patch('google.adk.models.lite_llm.LiteLlm')
@@ -238,8 +239,8 @@ async def test_creation_with_prefix_template(self, mock_litellm_class, dynamo_cf
             assert "x-prefix-id" in headers
             assert headers["x-prefix-id"].startswith("session-")
             assert headers["x-prefix-total-requests"] == "15"
-            assert headers["x-prefix-osl"] == "HIGH"
-            assert headers["x-prefix-iat"] == "LOW"
+            assert headers["x-prefix-osl"] == "2048"
+            assert headers["x-prefix-iat"] == "50"
 
             assert client is mock_llm_instance
 
@@ -268,6 +269,8 @@ async def test_excludes_dynamo_specific_fields(self, mock_litellm_class, dynamo_
         assert "prefix_total_requests" not in kwargs
         assert "prefix_osl" not in kwargs
         assert "prefix_iat" not in kwargs
+        assert "prefix_use_raw_values" not in kwargs
+        assert "disable_headers" not in kwargs
         assert "request_timeout" not in kwargs
 
     @patch('google.adk.models.lite_llm.LiteLlm')
@@ -280,6 +283,7 @@ async def test_prefix_id_is_unique_per_instance(self, mock_litellm_class, mock_b
         config = DynamoModelConfig(
             model_name="test-model",
             prefix_template="session-{uuid}",
+            disable_headers=False,
         )
 
         prefix_ids = set()
 
@@ -39,18 +39,28 @@
 -------------------------
 
 prefix_osl (Output Sequence Length)
-    Hint for expected response length:
+    Expected output tokens for response length hinting. By default, the raw
+    integer value is sent. When ``prefix_use_raw_values`` is False, values are
+    converted to categories:
 
-    - LOW: decode_cost=1.0, short responses
-    - MEDIUM: decode_cost=2.0, typical responses
-    - HIGH: decode_cost=3.0, long responses
+    - < 256 tokens: LOW (decode_cost=1.0, short responses)
+    - < 1024 tokens: MEDIUM (decode_cost=2.0, typical responses)
+    - >= 1024 tokens: HIGH (decode_cost=3.0, long responses)
+
+    Accepts categorical strings (LOW/MEDIUM/HIGH) for backward compatibility,
+    which are converted to representative token counts (128/512/2048).
 
 prefix_iat (Inter-Arrival Time)
-    Hint for request pacing:
+    Expected inter-arrival time in milliseconds. By default, the raw integer
+    value is sent. When ``prefix_use_raw_values`` is False, values are converted
+    to categories:
+
+    - < 100ms: LOW (iat_factor=1.5, rapid bursts, high worker stickiness)
+    - < 500ms: MEDIUM (iat_factor=1.0, normal pacing)
+    - >= 500ms: HIGH (iat_factor=0.6, slow requests, more exploration)
 
-    - LOW: iat_factor=1.5, rapid bursts -> high worker stickiness
-    - MEDIUM: iat_factor=1.0, normal pacing
-    - HIGH: iat_factor=0.6, slow requests -> more exploration
+    Accepts categorical strings (LOW/MEDIUM/HIGH) for backward compatibility,
+    which are converted to representative millisecond values (50/250/750).
 
 prefix_total_requests
     Expected requests per conversation:
@@ -74,6 +84,7 @@
     from nat.profiler.prediction_trie.trie_lookup import PredictionTrieLookup
 
 from pydantic import Field
+from pydantic import field_validator
 
 from nat.builder.builder import Builder
 from nat.builder.context import Context
@@ -90,6 +101,17 @@
 # Define valid prefix hint values
 PrefixLevel = Literal["LOW", "MEDIUM", "HIGH"]
 
+# Representative token counts for categorical levels (midpoint of ranges):
+# LOW: 128 tokens (midpoint of 0-256 range)
+# MEDIUM: 512 tokens (midpoint of 256-1024 range)
+# HIGH: 2048 tokens (midpoint of 1024-4096 range)
+_OSL_CATEGORY_TO_INT: dict[str, int] = {"LOW": 128, "MEDIUM": 512, "HIGH": 2048}
+# Representative interarrival times for categorical levels (midpoint of ranges):
+# LOW: 50ms (midpoint of 0-100ms range)
+# MEDIUM: 250ms (midpoint of 100-500ms range)
+# HIGH: 750ms (midpoint of 500-1000ms range)
+_IAT_CATEGORY_TO_INT: dict[str, int] = {"LOW": 50, "MEDIUM": 250, "HIGH": 750}
+
 # =============================================================================
 # CATEGORY CONVERSION HELPERS
 # =============================================================================
@@ -314,22 +336,22 @@ class DynamoModelConfig(OpenAIModelConfig, name="dynamo"):
                      "Lower values allow more load balancing across workers."),
         space=SearchSpace(low=1, high=20, step=5))
 
-    prefix_osl: PrefixLevel = OptimizableField(
-        default="MEDIUM",
-        description="Output Sequence Length hint for the Dynamo router. "
-        "LOW means short responses (decode_cost=1.0), "
-        "MEDIUM means typical (decode_cost=2.0), "
-        "HIGH means long responses (decode_cost=3.0).",
-        space=SearchSpace(values=["LOW", "MEDIUM", "HIGH"]),
+    prefix_osl: int = OptimizableField(
+        default=512,
+        ge=1,
+        description="Expected output tokens for response length hinting (Output Sequence Length). "
+        "Raw integer value is sent by default. Accepts categorical strings "
+        "(LOW/MEDIUM/HIGH) for backward compatibility (mapped to 128/512/2048).",
+        space=SearchSpace(low=64, high=4096, step=64),
     )
 
-    prefix_iat: PrefixLevel = OptimizableField(
-        default="MEDIUM",
-        description="Inter-Arrival Time hint for the Dynamo router. "
-        "LOW means rapid bursts (iat_factor=1.5, high stickiness), "
-        "MEDIUM means normal (iat_factor=1.0), "
-        "HIGH means slow requests (iat_factor=0.6, more exploration).",
-        space=SearchSpace(values=["LOW", "MEDIUM", "HIGH"]),
+    prefix_iat: int = OptimizableField(
+        default=250,
+        ge=1,
+        description="Expected inter-arrival time in milliseconds for request pacing. "
+        "Raw integer value is sent by default. Accepts categorical strings "
+        "(LOW/MEDIUM/HIGH) for backward compatibility (mapped to 50/250/750).",
+        space=SearchSpace(low=10, high=1000, step=50),
     )
 
     request_timeout: float = Field(
@@ -338,12 +360,49 @@ class DynamoModelConfig(OpenAIModelConfig, name="dynamo"):
         description="HTTP request timeout in seconds for LLM requests.",
     )
 
+    prefix_use_raw_values: bool = Field(
+        default=True,
+        description="When True, send raw integer values for OSL (output tokens) and IAT (interarrival ms) "
+        "in headers and nvext.agent_hints. When False, convert to categorical LOW/MEDIUM/HIGH.",
+    )
+
     prediction_trie_path: str | None = Field(
         default=None,
         description="Path to prediction_trie.json file. When set, predictions are "
         "looked up and used to override both HTTP headers and nvext.agent_hints for each LLM call.",
     )
 
+    disable_headers: bool = Field(
+        default=True,
+        description="If True, do not inject Dynamo prefix hints as HTTP headers. "
+        "Hints will still be injected via nvext.agent_hints in the request body if prefix_template is set.",
+    )
+
+    # =========================================================================
+    # VALIDATORS (backward compatibility: categorical strings -> integers)
+    # =========================================================================
+
+    @field_validator("prefix_osl", mode="before")
+    @classmethod
+    def _coerce_prefix_osl(cls, v: object) -> int:
+        if isinstance(v, int):
+            return v
+        if isinstance(v, str):
+            upper = v.upper()
+            if upper in _OSL_CATEGORY_TO_INT:
+                return _OSL_CATEGORY_TO_INT[upper]
+            raise ValueError(f"Invalid OSL value '{v}'. Must be an integer >= 1 "
+                             f"or one of: {', '.join(_OSL_CATEGORY_TO_INT.keys())}")
+        raise TypeError(f"prefix_osl must be int or str, got {type(v)}")
+
+    @field_validator("prefix_iat", mode="before")
+    @classmethod
+    def _coerce_prefix_iat(cls, v: object) -> object:
+        """Convert categorical IAT strings (LOW/MEDIUM/HIGH) to representative millisecond values."""
+        if isinstance(v, str) and v.upper() in _IAT_CATEGORY_TO_INT:
+            return _IAT_CATEGORY_TO_INT[v.upper()]
+        return v
+
     # =========================================================================
     # UTILITY METHODS
     # =========================================================================
@@ -371,8 +430,10 @@ def get_dynamo_field_names() -> frozenset[str]:
             "prefix_total_requests",
             "prefix_osl",
             "prefix_iat",
+            "prefix_use_raw_values",
             "request_timeout",
             "prediction_trie_path",
+            "disable_headers",
         })
 
 
@@ -397,15 +458,19 @@ def __init__(
         self,
         transport: httpx.AsyncBaseTransport,
         total_requests: int,
-        osl: str,
-        iat: str,
+        osl: int,
+        iat: int,
         prediction_lookup: "PredictionTrieLookup | None" = None,
+        use_raw_values: bool = True,
+        disable_headers: bool = True,
     ):
         self._transport = transport
         self._total_requests = total_requests
-        self._osl = osl.upper()
-        self._iat = iat.upper()
+        self._osl = osl
+        self._iat = iat
         self._prediction_lookup = prediction_lookup
+        self._use_raw_values = use_raw_values
+        self._disable_headers = disable_headers
 
     async def handle_async_request(self, request: "httpx.Request") -> "httpx.Response":
         # Get prefix ID from context (supports depth-awareness and overrides)
@@ -419,10 +484,10 @@ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Respons
             # If context not available or latency_sensitivity not implemented yet, default to MEDIUM
             latency_sensitivity = "MEDIUM"
 
-        # Initialize with static config values
+        # Initialize with static config values (always integers)
         total_requests = self._total_requests
-        osl = self._osl
-        iat = self._iat
+        osl_raw = self._osl
+        iat_raw = self._iat
 
         # Check for prediction override
         if self._prediction_lookup is not None:
@@ -445,19 +510,17 @@ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Respons
                 if prediction:
                     # Override with prediction-derived values
                     total_requests = int(prediction.remaining_calls.mean)
-                    osl = _output_tokens_to_osl(prediction.output_tokens.p90)
-                    iat = _interarrival_ms_to_iat(prediction.interarrival_ms.mean)
+                    osl_raw = int(prediction.output_tokens.p90)
+                    iat_raw = int(prediction.interarrival_ms.mean)
 
                     logger.debug(
                         "Overriding hints from prediction: path=%s, call_index=%d, "
-                        "total_requests=%d, osl=%s (tokens=%d), iat=%s (ms=%d)",
+                        "total_requests=%d, osl_raw=%d, iat_raw=%d",
                         path,
                         call_index,
                         total_requests,
-                        osl,
-                        int(prediction.output_tokens.p90),
-                        iat,
-                        int(prediction.interarrival_ms.mean),
+                        osl_raw,
+                        iat_raw,
                     )
                 else:
                     logger.debug(
@@ -469,26 +532,35 @@ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Respons
             except Exception:
                 logger.exception("Failed to lookup prediction")
 
-        # Inject HTTP headers
+        # Compute final values for headers/body
+        if self._use_raw_values:
+            osl_value: int | str = osl_raw
+            iat_value: int | str = iat_raw
+        else:
+            osl_value = _output_tokens_to_osl(osl_raw)
+            iat_value = _interarrival_ms_to_iat(iat_raw)
+
         headers = dict(request.headers)
-        headers[f"{LLMHeaderPrefix.DYNAMO}-id"] = prefix_id
-        headers[f"{LLMHeaderPrefix.DYNAMO}-total-requests"] = str(total_requests)
-        headers[f"{LLMHeaderPrefix.DYNAMO}-osl"] = osl
-        headers[f"{LLMHeaderPrefix.DYNAMO}-iat"] = iat
-        headers[f"{LLMHeaderPrefix.DYNAMO}-latency-sensitivity"] = latency_sensitivity
+        if not self._disable_headers:
+            # Headers always need strings
+            headers[f"{LLMHeaderPrefix.DYNAMO}-id"] = prefix_id
+            headers[f"{LLMHeaderPrefix.DYNAMO}-total-requests"] = str(total_requests)
+            headers[f"{LLMHeaderPrefix.DYNAMO}-osl"] = str(osl_value)
+            headers[f"{LLMHeaderPrefix.DYNAMO}-iat"] = str(iat_value)
+            headers[f"{LLMHeaderPrefix.DYNAMO}-latency-sensitivity"] = latency_sensitivity
 
         # Modify body to inject nvext.agent_hints (if JSON POST request)
         content = request.content
         if request.method == "POST" and content:
             try:
                 body = json.loads(content.decode("utf-8", errors="replace"))
                 if isinstance(body, dict):
-                    # Build agent_hints dict
+                    # Build agent_hints dict (int or str depending on raw mode)
                     agent_hints = {
                         "prefix_id": prefix_id,
                         "total_requests": total_requests,
-                        "osl": osl,
-                        "iat": iat,
+                        "osl": osl_value,
+                        "iat": iat_value,
                         "latency_sensitivity": latency_sensitivity,
                     }
 
@@ -527,8 +599,8 @@ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Respons
         logger.debug("Injected Dynamo hints: prefix_id=%s, total_requests=%d, osl=%s, iat=%s, latency_sensitivity=%s",
                      prefix_id,
                      total_requests,
-                     osl,
-                     iat,
+                     osl_value,
+                     iat_value,
                      latency_sensitivity)
 
         return await self._transport.handle_async_request(new_request)
@@ -546,10 +618,12 @@ async def aclose(self) -> None:
 def create_httpx_client_with_dynamo_hooks(
     prefix_template: str | None,
     total_requests: int,
-    osl: str,
-    iat: str,
+    osl: int,
+    iat: int,
     timeout: float = 600.0,
     prediction_lookup: "PredictionTrieLookup | None" = None,
+    use_raw_values: bool = True,
+    disable_headers: bool = True,
 ) -> "httpx.AsyncClient":
     """
     Create an httpx.AsyncClient with Dynamo hint injection via custom transport.
@@ -564,10 +638,12 @@ def create_httpx_client_with_dynamo_hooks(
     Args:
         prefix_template: Template string with {uuid} placeholder (unused, kept for API compat)
         total_requests: Expected number of requests for this prefix
-        osl: Output sequence length hint (LOW/MEDIUM/HIGH)
-        iat: Inter-arrival time hint (LOW/MEDIUM/HIGH)
+        osl: Expected output tokens (raw integer value)
+        iat: Expected inter-arrival time in milliseconds (raw integer value)
         timeout: HTTP request timeout in seconds
         prediction_lookup: Optional PredictionTrieLookup for dynamic hint injection
+        use_raw_values: When True send raw integers; when False convert to LOW/MEDIUM/HIGH
+        disable_headers: If True, do not inject hints as HTTP headers (still injects nvext.agent_hints)
 
     Returns:
         An httpx.AsyncClient configured with Dynamo hint injection.
@@ -586,6 +662,8 @@ def create_httpx_client_with_dynamo_hooks(
         osl=osl,
         iat=iat,
         prediction_lookup=prediction_lookup,
+        use_raw_values=use_raw_values,
+        disable_headers=disable_headers,
     )
 
     return httpx.AsyncClient(