diff --git a/articles/gpt-oss/run-vllm.md b/articles/gpt-oss/run-vllm.md
index e534ae5302..276ed88ea3 100644
--- a/articles/gpt-oss/run-vllm.md
+++ b/articles/gpt-oss/run-vllm.md
@@ -170,6 +170,11 @@ uv pip install openai-harmony
 Afterwards you can use harmony to encode and parse the tokens generated by vLLM’s generate function.
 
 ```py
+# Activate the virtual environment created above first: source .oss/bin/activate
+
+import os
+os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "0"  # don't use the FlashInfer sampler
+
 import json
 from openai_harmony import (
     HarmonyEncodingName,
@@ -180,12 +185,12 @@ from openai_harmony import (
     SystemContent,
     DeveloperContent,
 )
-
+
 from vllm import LLM, SamplingParams
 
 # --- 1) Render the prefill with Harmony ---
 encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
-
+
 convo = Conversation.from_messages(
     [
         Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
@@ -196,37 +201,41 @@ convo = Conversation.from_messages(
         Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
     ]
 )
-
+
 prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
-
+
 # Harmony stop tokens (pass to sampler so they won't be included in output)
 stop_token_ids = encoding.stop_tokens_for_assistant_actions()
-
+
 # --- 2) Run vLLM with prefill ---
 llm = LLM(
-    model="openai/gpt-oss-120b",
+    model="openai/gpt-oss-20b",
     trust_remote_code=True,
+    gpu_memory_utilization=0.95,
+    # max_num_batched_tokens=4096,  # optional: cap tokens per scheduling batch
+    # max_model_len=5000,  # optional: cap the context length
+    # tensor_parallel_size=1,  # optional: number of GPUs to shard across
 )
-
+
 sampling = SamplingParams(
     max_tokens=128,
     temperature=1,
     stop_token_ids=stop_token_ids,
 )
-
+
 outputs = llm.generate(
     prompt_token_ids=[prefill_ids],  # batch of size 1
     sampling_params=sampling,
 )
-
+
 # vLLM gives you both text and token IDs
 gen = outputs[0].outputs[0]
 text = gen.text
 output_tokens = gen.token_ids  # <-- these are the completion token IDs (no prefill)
-
+
 # --- 3) Parse the completion token IDs back into structured Harmony messages ---
 entries = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)
-
+
 # 'entries' is a sequence of structured conversation entries (assistant messages, tool calls, etc.).
 for message in entries:
     print(f"{json.dumps(message.to_dict())}")
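
Beyond printing each entry as JSON, the parsed messages can be filtered down to just the user-facing reply. The following is a minimal sketch, not part of the patch above: it assumes each entry's `to_dict()` output includes a `"channel"` field (`"analysis"` for the reasoning trace, `"final"` for the answer shown to the user) and a `"content"` list of parts with a `"text"` key, matching the dicts printed by the final loop.

```py
# Sketch only (field names assumed from the to_dict() output above):
# collect just the text the model addressed to the user on the "final" channel.
final_parts = []
for message in entries:
    d = message.to_dict()  # same structure as the json.dumps output above
    if d.get("channel") == "final":
        final_parts.extend(p["text"] for p in d.get("content", []) if "text" in p)

print("".join(final_parts))  # e.g. the assistant's answer about the weather in SF
```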