Commit de98252

Add GPT-OSS model code and config [1/N] (#22327)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent 796bae0 · commit de98252

File tree: 4 files changed (+503, -0 lines)
tests/models/registry.py (1 addition, 0 deletions)

@@ -197,6 +197,7 @@ def check_available_online(
                                        {"6b": "EleutherAI/gpt-j-6b"}),
     "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
                                           {"1b": "EleutherAI/pythia-1.4b"}),
+    "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
     "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501

vllm/model_executor/models/config.py (29 additions, 0 deletions)

@@ -247,6 +247,34 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
                               config.max_model_len)
 
 
+class GptOssConfig(VerifyAndUpdateConfig):
+
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        decoding_config = vllm_config.decoding_config
+        if decoding_config.reasoning_backend == "":
+            decoding_config.reasoning_backend = "openai"
+
+        # Increase the max capture size from 512 to 1024 for performance.
+        # NOTE(woosuk): This will increase the number of CUDA graphs
+        # from 67 to 83.
+        scheduler_config = vllm_config.scheduler_config
+        if len(scheduler_config.cuda_graph_sizes) == 1:
+            max_capture_size = scheduler_config.cuda_graph_sizes[0]
+            # FIXME(woosuk): When using full cuda graph with FA3, the max
+            # supported size is 992.
+            if max_capture_size < 1024:
+                cuda_graph_sizes = [1, 2, 4]
+                # Step size 8 for small batch sizes
+                cuda_graph_sizes += [i for i in range(8, 256, 8)]
+                # Step size 16 for larger batch sizes
+                cuda_graph_sizes += [i for i in range(256, 1025, 16)]
+                scheduler_config.cuda_graph_sizes = cuda_graph_sizes
+                logger.info(
+                    "Overriding max cuda graph capture size to "
+                    "%d for performance.", 1024)
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod

@@ -345,4 +373,5 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
     "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
     "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
     "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig,
+    "GptOssForCausalLM": GptOssConfig,
 }
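
The capture-size schedule above can be sanity-checked in isolation. The standalone sketch below (not vLLM code) rebuilds both the pre-override schedule implied by a 512 cap and the new 1024 schedule; the shape of the old schedule ([1, 2, 4] plus multiples of 8 up to the cap) is an assumption, but it reproduces the 67-to-83 graph counts cited in the NOTE in the diff.

# Standalone sanity check of the capture-size math in GptOssConfig above.
# The old-schedule shape is an assumption consistent with the NOTE's counts.

def old_schedule(cap: int = 512) -> list[int]:
    # Assumed default: [1, 2, 4] plus multiples of 8 up to the cap.
    return [1, 2, 4] + list(range(8, cap + 1, 8))

def gpt_oss_schedule() -> list[int]:
    # Mirrors the list construction in GptOssConfig.
    sizes = [1, 2, 4]
    sizes += list(range(8, 256, 8))      # step 8 for small batch sizes
    sizes += list(range(256, 1025, 16))  # step 16 up to 1024
    return sizes

print(len(old_schedule()))      # 67 CUDA graphs before the override
print(len(gpt_oss_schedule()))  # 83 CUDA graphs after the override
print(max(gpt_oss_schedule()))  # 1024, the new max capture size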
