Fix OSS 20B vLLM example: add offline-serve workflow (no flash-infer sm7+) - Update run-vllm.md #2041

Open · wants to merge 1 commit into main
32 changes: 21 additions & 11 deletions articles/gpt-oss/run-vllm.md
Afterwards, you can use Harmony to encode and parse the tokens generated by vLLM's `generate` function.

```py
# source .oss/bin/activate

import os

# Disable vLLM's FlashInfer sampler so this offline example also runs on GPUs
# without FlashInfer support.
os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "0"

import json
from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
    Conversation,
    Message,
    Role,
    SystemContent,
    DeveloperContent,
)

from vllm import LLM, SamplingParams

# --- 1) Render the prefill with Harmony ---
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

convo = Conversation.from_messages(
    [
        Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
        Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
    ]
)

prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)

# Harmony stop tokens (pass to sampler so they won't be included in output)
stop_token_ids = encoding.stop_tokens_for_assistant_actions()

# --- 2) Run vLLM with prefill ---
llm = LLM(
    model="openai/gpt-oss-20b",
    trust_remote_code=True,
    gpu_memory_utilization=0.95,
    # max_num_batched_tokens=4096,  # Optional
    # max_model_len=5000,           # Optional
    # tensor_parallel_size=1,       # Optional
)

sampling = SamplingParams(
    max_tokens=128,
    temperature=1,
    stop_token_ids=stop_token_ids,
)

outputs = llm.generate(
    prompt_token_ids=[prefill_ids],  # batch of size 1
    sampling_params=sampling,
)

# vLLM gives you both text and token IDs
gen = outputs[0].outputs[0]
text = gen.text
output_tokens = gen.token_ids # <-- these are the completion token IDs (no prefill)

# --- 3) Parse the completion token IDs back into structured Harmony messages ---
entries = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)

# 'entries' is a sequence of structured conversation entries (assistant messages, tool calls, etc.).
for message in entries:
    print(json.dumps(message.to_dict()))
```
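
If you want to run several conversations in the same offline pass, the objects above can be reused for a batched `generate` call. Below is a minimal sketch, not part of the diff itself: it assumes `encoding`, `llm`, and `sampling` are defined as in the example, and that `conversations` is a hypothetical list of Harmony `Conversation` objects you build yourself.

```py
# Minimal sketch (not part of the diff above): batched offline generation.
# Assumes `encoding`, `llm`, and `sampling` from the example, plus a
# hypothetical list of Harmony Conversation objects named `conversations`.
prefills = [
    encoding.render_conversation_for_completion(c, Role.ASSISTANT)
    for c in conversations
]

outputs = llm.generate(
    prompt_token_ids=prefills,  # one token-ID list per conversation
    sampling_params=sampling,
)

for out in outputs:
    completion = out.outputs[0]
    entries = encoding.parse_messages_from_completion_tokens(
        completion.token_ids, Role.ASSISTANT
    )
    for message in entries:
        print(json.dumps(message.to_dict()))
```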