[SW-238029] [1.22] Fix max_batch_size handling - Llama perf degradation fix (#1828)

jiminha · iboiko-habana · web-flow · commit a6a0ba311815 · 2025-08-28T15:12:31.000+02:00
Llama perf degradation was seen after Gemma3 support was added (#1616): max_batch_size was initialized incorrectly for profile_run because the code checked whether mm_registry was set instead of whether an actual multimodal model is in use. The fix initializes max_batch_size to 1 only when a multimodal (mrope or mm_optimized) model is in use. ## Test Result Llama v3.1 70B 2048/128 BF16 2xcard - perf dropped from 170 tps to 150 tps. With this fix, it's back to 170 tps. --------- Co-authored-by: Iryna Boiko <iboiko@habana.ai>
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
@@ -1491,11 +1491,10 @@ def move_to_device(self, tensor):
                                                        non_blocking=True)
 
     def add_vision_buckets_to_mrope_mm_optimized(self):
-        if self.mm_registry is not None:
-            model = self.get_model()
-            self.is_mm_optimized = is_mm_optimized(model)
-            if self.model_is_mrope or self.is_mm_optimized:
-                model.vision_buckets = VisionBuckets(self.is_mm_optimized)
+        model = self.get_model()
+        self.is_mm_optimized = is_mm_optimized(model)
+        if self.model_is_mrope or self.is_mm_optimized:
+            model.vision_buckets = VisionBuckets(self.is_mm_optimized)
 
     def _prepare_prompt(
         self,
@@ -2804,10 +2803,10 @@ def profile_run(self) -> None:
         max_seq_len = self.bucketing_manager.get_max_prompt_shape()
         max_batch_size = min(self.max_num_seqs,
                              self.max_num_batched_tokens // max_seq_len)
-        # Using batch_size 1 is profile multimodal models
-        max_batch_size = max_batch_size if self.mm_registry is None else 1
 
         if self.model_is_mrope or self.is_mm_optimized:
+            # Using batch_size 1 is profile multimodal models
+            max_batch_size = 1
             model = self.get_model()
             self.multimodal_buckets = model.vision_buckets.multimodal_buckets
             logger_msg = "Multimodal bucket : " + str(self.multimodal_buckets)