@@ -247,6 +247,34 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
             config.max_model_len)
 
 
+class GptOssConfig(VerifyAndUpdateConfig):
+
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        decoding_config = vllm_config.decoding_config
+        if decoding_config.reasoning_backend == "":
+            decoding_config.reasoning_backend = "openai"
+
+        # Increase the max capture size from 512 to 1024 for performance.
+        # NOTE(woosuk): This will increase the number of CUDA graphs
+        # from 67 to 83.
+        scheduler_config = vllm_config.scheduler_config
+        if len(scheduler_config.cuda_graph_sizes) == 1:
+            max_capture_size = scheduler_config.cuda_graph_sizes[0]
+            # FIXME(woosuk): When using full cuda graph with FA3, the max
+            # supported size is 992.
+            if max_capture_size < 1024:
+                cuda_graph_sizes = [1, 2, 4]
+                # Step size 8 for small batch sizes
+                cuda_graph_sizes += [i for i in range(8, 256, 8)]
+                # Step size 16 for larger batch sizes
+                cuda_graph_sizes += [i for i in range(256, 1025, 16)]
+                scheduler_config.cuda_graph_sizes = cuda_graph_sizes
+                logger.info(
+                    "Overriding max cuda graph capture size to "
+                    "%d for performance.", 1024)
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod
@@ -345,4 +373,5 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
     "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
     "JambaForSequenceClassification": JambaForSequenceClassificationConfig,
     "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig,
+    "GptOssForCausalLM": GptOssConfig,
 }
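
As a sanity check on the NOTE in the diff (the CUDA graph count going from 67 to 83), the following standalone sketch rebuilds the same capture-size schedule outside vLLM. Only the list construction is taken from the diff; the asserts and comments are illustrative:

    # Rebuild the capture-size schedule from the diff above.
    cuda_graph_sizes = [1, 2, 4]                    # 3 small batch sizes
    cuda_graph_sizes += list(range(8, 256, 8))      # 31 sizes: 8, 16, ..., 248
    cuda_graph_sizes += list(range(256, 1025, 16))  # 49 sizes: 256, 272, ..., 1024

    # 3 + 31 + 49 = 83, matching the "from 67 to 83" NOTE(woosuk) comment.
    assert len(cuda_graph_sizes) == 83
    assert max(cuda_graph_sizes) == 1024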