Skip to content

Commit c5b0f9e

Browse files
authored
[https://nvbugs/5633700][fix] Cache tiktoken vocab for gpt-oss (#10219)
Signed-off-by: Pengyun Lin <[email protected]>
1 parent bfc5919 commit c5b0f9e

File tree

3 files changed

+35
-22
lines changed

3 files changed

+35
-22
lines changed

tensorrt_llm/serve/harmony_adapter.py

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None:
217217

218218
# Check if tool is allowed
219219
if self.should_filter_tools and func_name not in self.available_tools:
220-
logger.debug("Request %s: tool %s not in available tools",
221-
self.request_id, func_name)
220+
logger.debug(
221+
f"Request {self.request_id}: tool {func_name} not in available tools"
222+
)
222223
return None
223224

224225
# Get or create tool call
@@ -273,8 +274,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None:
273274
else:
274275
return {"content": self.parser.last_content_delta}
275276
else:
276-
logger.debug("Request %s: no delta generated for channel=%s",
277-
self.request_id, self.parser.current_channel)
277+
logger.debug(
278+
f"Request {self.request_id}: no delta generated for channel={self.parser.current_channel}"
279+
)
278280
return None
279281

280282
def _get_or_create_tool_call(self, func_name: str) -> str:
@@ -295,8 +297,9 @@ def _get_or_create_tool_call(self, func_name: str) -> str:
295297
"active": True
296298
}
297299
self.tool_call_index += 1
298-
logger.debug("Request %s: created new tool call %s for function %s",
299-
self.request_id, tool_id, func_name)
300+
logger.debug(
301+
f"Request {self.request_id}: created new tool call {tool_id} for function {func_name}"
302+
)
300303
return tool_id
301304

302305
def get_debug_info(self) -> dict[str, Any]:
@@ -896,8 +899,8 @@ def _parse_tool_call_from_harmony_message(
896899
}
897900
except json.JSONDecodeError:
898901
logger.warning(
899-
"Failed to parse tool call arguments as JSON: %s",
900-
function_call_args)
902+
f"Failed to parse tool call arguments as JSON: {function_call_args}"
903+
)
901904
return None
902905
elif msg_content_type and "code" in msg_content_type:
903906
function_name = str(msg_recipient)
@@ -1023,10 +1026,11 @@ def harmony_output_to_openai(
10231026
except (HarmonyError, UnicodeDecodeError,
10241027
ValueError) as parse_error:
10251028
logger.warning(
1026-
"Failed to parse harmony messages from tokens: %s",
1027-
parse_error)
1028-
logger.debug("Problematic clean tokens (%d): %s",
1029-
len(clean_tokens), clean_tokens)
1029+
f"Failed to parse harmony messages from tokens: {parse_error}"
1030+
)
1031+
logger.debug(
1032+
f"Problematic clean tokens ({len(clean_tokens)}): {clean_tokens}"
1033+
)
10301034
# Fallback to raw text parsing
10311035
raise RuntimeError(f"Harmony parsing failed: {parse_error}"
10321036
) # This will be caught by outer try-catch
@@ -1103,9 +1107,9 @@ def harmony_output_to_openai(
11031107
except Exception as e:
11041108
raw_text = self._safe_decode_utf8(harmony_output_tokens,
11051109
"HARMONY _OUTPUT: ")
1106-
logger.warning("Failed to parse harmony output: %s. Raw output: %s",
1107-
e, raw_text)
1108-
logger.debug("Detailed error: %s", traceback.format_exc())
1110+
logger.warning(
1111+
f"Failed to parse harmony output: {e}. Raw output: {raw_text}")
1112+
logger.debug(f"Detailed error: {traceback.format_exc()}")
11091113

11101114
# Check if raw_text contains a decode error (fallback content)
11111115
if "HARMONY_OUTPUT:" in raw_text:
@@ -1276,9 +1280,9 @@ def stateful_stream_harmony_tokens_to_openai_deltas(
12761280
return deltas
12771281
except (HarmonyError, UnicodeDecodeError, ValueError):
12781282
logger.error(
1279-
f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}",
1283+
f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}"
12801284
)
1281-
logger.debug("Problematic streaming tokens: %s", tokens)
1285+
logger.debug(f"Problematic streaming tokens: {tokens}")
12821286

12831287
# Return empty deltas to continue processing
12841288
return []
@@ -1457,8 +1461,8 @@ def create_stream_state(
14571461
"""
14581462
if request_id in self._stream_states:
14591463
logger.warning(
1460-
"Stream state already exists for request %s, replacing",
1461-
request_id)
1464+
f"Stream state already exists for request {request_id}, replacing"
1465+
)
14621466

14631467
stream_state = HarmonyStreamState(
14641468
request_id=request_id,
@@ -1494,7 +1498,7 @@ def _filter_tool_calls(
14941498

14951499
# Filter unavailable external tools
14961500
if should_filter_external_tools and func_name not in external_tools:
1497-
logger.debug("Filtered unavailable tool call: %s", func_name)
1501+
logger.debug(f"Filtered unavailable tool call: {func_name}")
14981502
continue
14991503

15001504
filtered.append(tool_call)
@@ -1644,7 +1648,7 @@ def handle_non_streaming_response(tools: List[ChatCompletionToolsParam],
16441648
output.token_ids, tools_for_parser, tool_choice)
16451649

16461650
# CONVERTED OUTPUT (after harmony to openai conversion)
1647-
logger.debug("✅ CONVERTED OUTPUT: %s", json.dumps(parsed_output, indent=2))
1651+
logger.debug(f"✅ CONVERTED OUTPUT: {json.dumps(parsed_output, indent=2)}")
16481652

16491653
# Create response message
16501654
response_message = _create_response_message(parsed_output)

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutla
367367
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
368368
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
369369
test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
370-
test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700)
371370
accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193)
372371
accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193)
373372
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194)

tests/unittest/llmapi/apps/_test_openai_chat_harmony.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
import json
2+
import os
23

34
import openai
45
import pytest
6+
from utils.llm_data import llm_datasets_root
57

68
from ..test_llm import get_model_path
79
from .openai_server import RemoteOpenAIServer
810

911
pytestmark = pytest.mark.threadleak(enabled=False)
12+
os.environ['TIKTOKEN_RS_CACHE_DIR'] = os.path.join(llm_datasets_root(),
13+
'tiktoken_vocab')
14+
os.environ['TIKTOKEN_ENCODINGS_BASE'] = os.path.join(llm_datasets_root(),
15+
'tiktoken_vocab')
1016

1117

1218
@pytest.fixture(scope="module", ids=["GPT-OSS-20B"])
@@ -114,8 +120,10 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str):
114120
model=model,
115121
messages=messages,
116122
tools=[tool_get_current_weather],
123+
extra_body={"top_k": 1},
117124
)
118125
message = response.choices[0].message
126+
print(message)
119127
assert response.choices[0].finish_reason == "tool_calls"
120128
assert message.content is None
121129
assert message.reasoning
@@ -137,6 +145,7 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str):
137145
response = await client.chat.completions.create(
138146
model=model,
139147
messages=messages,
148+
extra_body={"top_k": 1},
140149
)
141150
message = response.choices[0].message
142151
assert message.content
@@ -205,6 +214,7 @@ async def test_streaming_tool_call(client: openai.AsyncOpenAI, model: str):
205214
messages=messages,
206215
tools=[tool_get_current_weather],
207216
stream=True,
217+
extra_body={"top_k": 1},
208218
)
209219
tool_name: str
210220
reasoning_chunks: list[str] = []

0 commit comments

Comments (0)