Commit e2c3be0

Address feedback

1 parent 330b971 commit e2c3be0

File tree

7 files changed: +62 -39 lines changed


docs/cli-options.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -167,7 +167,7 @@ Use the legacy 'max_tokens' field instead of 'max_completion_tokens' in request
 
 #### `--use-server-token-count`
 
-[Deprecated] This flag is a no-op and will be removed in a future release. AIPerf now always computes both client-side and server-reported token counts. Server counts are preferred for output metrics; client counts are used for input validation.
+[Deprecated] This flag is a no-op and will be removed in a future release. AIPerf now prefers server-reported token counts when available and falls back to client-side tokenization for input. Use --tokenize-output to enable client-side output tokenization.
 <br/>_Flag (no value required)_
 
 #### `--stream-usage`, `--no-stream-usage`
@@ -675,7 +675,7 @@ Enable client-side tokenization of output and reasoning tokens, even when the se
 
 #### `--tokenize-input`, `--no-tokenize-input`
 
-Enable client-side tokenization of input prompts for every request. When enabled, locally computed input token counts are always stored in token_counts.input_local. When disabled, client-side input tokenization only occurs as a fallback when the server does not report prompt tokens. Automatically set to False for user-provided input datasets (--custom-dataset-type or --public-dataset) unless explicitly overridden.
+Enable client-side tokenization of input prompts for every request. When enabled, locally computed input token counts are always stored in token_counts.input_local. When disabled, client-side input tokenization only occurs as a fallback when the server does not report prompt tokens. Use --no-tokenize-input to disable client-side input tokenization entirely, including fallback. Automatically set to False for user-provided input datasets (--custom-dataset-type or --public-dataset) unless explicitly overridden.
 <br/>_Default: `True`_
 
 ### Load Generator
```

docs/metrics-reference.md

Lines changed: 7 additions & 4 deletions
````diff
@@ -394,15 +394,18 @@ The number of input/prompt tokens for a single request. This represents the size
 **Formula:**
 ```python
 # Server-preferred (falls back to client-side)
-input_sequence_length = usage.prompt_tokens or len(tokenizer.encode(prompt, add_special_tokens=False))
+if usage.prompt_tokens is not None:
+    input_sequence_length = usage.prompt_tokens
+else:
+    input_sequence_length = len(tokenizer.encode(prompt, add_special_tokens=False))
 ```
 
 **Notes:**
 - When the server reports `usage.prompt_tokens`, that value is used for ISL (and thus for console display and derived metrics).
-- Falls back to client-side tokenization when server does not report prompt token counts.
+- Falls back to client-side tokenization when server does not report prompt token counts, unless `--no-tokenize-input` is explicitly specified.
 - Client-side tokenization uses `add_special_tokens=False` to count only content tokens.
 - Automatically disabled for user-provided input datasets; use `--tokenize-input` to force.
-- Use `--no-tokenize-input` to skip when relying on server-reported prompt tokens.
+- Use `--no-tokenize-input` to disable client-side input tokenization entirely (no fallback).
 - Useful for understanding the relationship between input size and latency/throughput.
 
 ---
@@ -813,7 +816,7 @@ total_usage_total_tokens = sum(r.usage_total_tokens for r in records if r.valid)
 ## Usage Discrepancy Metrics
 
 > [!NOTE]
-> These metrics measure the percentage difference between API-reported token counts (`usage` fields) and client-computed token counts. They are **not displayed in console output** but help identify tokenizer mismatches or counting discrepancies. Prompt diff requires `--tokenize-input` (or fallback tokenization when server omits prompt tokens) for user-provided datasets. Output and reasoning diff metrics require `--tokenize-output` to populate both server and client values.
+> These metrics measure the percentage difference between API-reported token counts (`usage` fields) and client-computed token counts. They are **not displayed in console output** but help identify tokenizer mismatches or counting discrepancies. Prompt diff requires both server-reported and client-computed input token counts. Client-side input tokenization is used as a fallback when the server omits prompt tokens, unless `--no-tokenize-input` is explicitly specified. Output and reasoning diff metrics require `--tokenize-output` to populate both server and client values.
 
 ### Usage Prompt Tokens Diff %
 
````

src/aiperf/common/config/endpoint_config.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -230,8 +230,9 @@ def url(self) -> str:
         Field(
             description=(
                 "[Deprecated] This flag is a no-op and will be removed in a future release. "
-                "AIPerf now always computes both client-side and server-reported token counts. "
-                "Server counts are preferred for output metrics; client counts are used for input validation."
+                "AIPerf now prefers server-reported token counts when available and falls back "
+                "to client-side tokenization for input. Use --tokenize-output to enable "
+                "client-side output tokenization."
             ),
         ),
         CLIParameter(
```

src/aiperf/common/config/tokenizer_config.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -80,6 +80,8 @@ class TokenizerConfig(BaseConfig):
             "When enabled, locally computed input token counts are always stored "
             "in token_counts.input_local. When disabled, client-side input tokenization "
             "only occurs as a fallback when the server does not report prompt tokens. "
+            "Use --no-tokenize-input to disable client-side input tokenization entirely, "
+            "including fallback. "
             "Automatically set to False for user-provided input datasets "
             "(--custom-dataset-type or --public-dataset) unless explicitly overridden.",
         ),
```
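The "unless explicitly overridden" behavior depends on telling a default `tokenize_input=True` apart from a value the user actually passed. The parser change in this commit does that via Pydantic's `model_fields_set`, which records only explicitly supplied fields. A plain-Python stand-in (not AIPerf's real `TokenizerConfig`) mimicking that check:

```python
class FakeTokenizerConfig:
    """Stand-in (not AIPerf's TokenizerConfig) that mimics Pydantic's
    model_fields_set: the set of fields the caller passed explicitly."""

    def __init__(self, **kwargs):
        self.tokenize_input = kwargs.get("tokenize_input", True)
        self.model_fields_set = set(kwargs)


def explicit_no_tokenize_input(cfg) -> bool:
    # Mirrors the parser's __init__ check: only counts as an explicit
    # --no-tokenize-input if the user actually set the field to False.
    return "tokenize_input" in cfg.model_fields_set and not cfg.tokenize_input
```

This is why a dataset-driven default of `False` does not disable fallback tokenization, while a user-supplied `--no-tokenize-input` does.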

src/aiperf/common/models/record_models.py

Lines changed: 12 additions & 24 deletions
```diff
@@ -881,30 +881,18 @@ def __post_init__(self) -> None:
 class TokenCounts:
     """Token counts for a record."""
 
-    input: int | None = Field(
-        default=None,
-        description="The server-reported prompt token count from the API usage field. If None, the server did not report prompt tokens.",
-    )
-    input_local: int | None = Field(
-        default=None,
-        description="The number of input tokens computed by the client-side tokenizer. If None, the number of tokens could not be calculated.",
-    )
-    output: int | None = Field(
-        default=None,
-        description="The server-reported output token count (completion minus reasoning). If None, the server did not report completion tokens.",
-    )
-    output_local: int | None = Field(
-        default=None,
-        description="The number of output tokens computed by the client-side tokenizer.",
-    )
-    reasoning: int | None = Field(
-        default=None,
-        description="The server-reported reasoning token count. If None, the server did not report reasoning tokens.",
-    )
-    reasoning_local: int | None = Field(
-        default=None,
-        description="The number of reasoning tokens computed by the client-side tokenizer.",
-    )
+    input: int | None = None
+    """Server-reported prompt token count from the API usage field."""
+    input_local: int | None = None
+    """Input tokens computed by the client-side tokenizer."""
+    output: int | None = None
+    """Server-reported output token count (completion minus reasoning)."""
+    output_local: int | None = None
+    """Output tokens computed by the client-side tokenizer."""
+    reasoning: int | None = None
+    """Server-reported reasoning token count."""
+    reasoning_local: int | None = None
+    """Reasoning tokens computed by the client-side tokenizer."""
 
 
 @dataclass
```

src/aiperf/records/inference_result_parser.py

Lines changed: 19 additions & 7 deletions
```diff
@@ -54,6 +54,7 @@ def __init__(
             "tokenize_input" in user_config.tokenizer.model_fields_set
             and not user_config.tokenizer.tokenize_input
         )
+        self._warned_no_usage: bool = False
         if (
             self.model_endpoint.endpoint.streaming
             and self.model_endpoint.endpoint.stream_usage
@@ -63,12 +64,19 @@ def __init__(
                 "Server-reported token counts will be requested. "
                 "Use --no-stream-usage if the server does not support stream_options."
             )
-        if not self.disable_tokenization and not self.tokenize_input:
+        if not self.disable_tokenization and self._explicit_no_tokenize_input:
             self.info(
-                "Input tokenization is disabled. "
-                "Usage prompt token diff metrics will not be available. "
+                "Input tokenization is disabled (--no-tokenize-input). "
+                "Client-side input token counts will not be computed, even as fallback. "
                 "Use --tokenize-input to enable."
             )
+        elif not self.disable_tokenization and not self.tokenize_input:
+            self.info(
+                "Always-on input tokenization is disabled. "
+                "Client-side input tokenization will still occur as a fallback "
+                "when the server does not report prompt tokens. "
+                "Use --tokenize-input to enable for all requests."
+            )
         if not self.disable_tokenization and not self.tokenize_output:
             self.info(
                 "Output tokenization is disabled. "
@@ -311,21 +319,25 @@ async def _compute_token_counts(
             responses, reasoning_server
         )
 
-        # Warn if server provided no usage information at all
+        # Warn once if server provided no usage information at all
         if (
-            input_token_count is None
+            not self._warned_no_usage
+            and input_token_count is None
             and output_server is None
             and reasoning_server is None
         ):
+            self._warned_no_usage = True
             self.warning(
                 "Server did not provide token usage information. Token count metrics will be unavailable. "
                 "Verify that your API endpoint supports usage reporting (stream_options are automatically configured for OpenAI-compatible endpoints)."
             )
 
         # Client-side input tokenization
         input_local: int | None = None
-        if not self.disable_tokenization and (
-            self.tokenize_input or input_token_count is None
+        if (
+            not self.disable_tokenization
+            and not self._explicit_no_tokenize_input
+            and (self.tokenize_input or input_token_count is None)
         ):
             try:
                 input_local = await self.compute_input_token_count(request_record)
```

tests/unit/records/test_inference_result_parser.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -560,6 +560,23 @@ async def test_no_tokenize_input_fallback(
         assert result.token_counts.input is None
         assert result.token_counts.input_local == 8  # 8 words in sample turn
 
+    async def test_explicit_no_tokenize_input_skips_fallback(
+        self, setup_inference_parser, request_record, spy_tokenizer
+    ):
+        """Explicit --no-tokenize-input without server usage → no fallback."""
+        setup_inference_parser.tokenize_input = False
+        setup_inference_parser._explicit_no_tokenize_input = True
+        setup_inference_parser.get_tokenizer = AsyncMock(return_value=spy_tokenizer)
+        setup_parser_responses(
+            setup_inference_parser,
+            [make_parsed_response(text="output", include_usage=False)],
+        )
+
+        result = await setup_inference_parser.process_valid_record(request_record)
+
+        assert result.token_counts.input is None
+        assert result.token_counts.input_local is None
+
     async def test_tokenize_input_always_computes(
         self, setup_inference_parser, request_record, spy_tokenizer
     ):
```
