Addressing Review Comments 2

qcdipankar · qcdipankar · commit 450c8d635c1e · 2026-03-11T03:26:40.000Z
Signed-off-by: Dipankar Sarkar <dipankar@qti.qualcomm.com>
diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py
@@ -252,29 +252,10 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -
 
             # Process image and text
             inputs = self._processor(images=image, text=prompt, return_tensors="pt")
-            if (
-                hasattr(self._qeff_model.model.config, "model_type")
-                and self._qeff_model.model.config.model_type == "qwen2_5_vl"
-            ):
-                inputs = self._qeff_model.model.prepare_inputs_for_generation(
-                    inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
-                )
-
-            if (
-                hasattr(self._qeff_model.model.config, "model_type")
-                and self._qeff_model.model.config.model_type == "qwen3_vl_moe"
-            ):
+            if getattr(self._qeff_model.model.config, "model_type", None) in {"qwen2_5_vl", "qwen3_vl_moe", "qwen3_vl"}:
                 inputs = self._qeff_model.model.prepare_inputs_for_generation(
-                    inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
-                )
-
-            if (
-                hasattr(self._qeff_model.model.config, "model_type")
-                and self._qeff_model.model.config.model_type == "qwen3_vl"
-            ):
-                inputs = self._qeff_model.model.prepare_inputs_for_generation(
-                    inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
-                )
+                    inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
+                )
 
             # Convert to float32 if needed
             if "pixel_values" in inputs:
@@ -426,7 +407,7 @@ def setup_vision_buffers(self):
             buffers = {}
             for output_name, shape in shapes.items():
                 # Create placeholder with appropriate dtype
-                if "vision_embeds" or "deepstack_features" in output_name:
+                if "vision_embeds" in output_name or "deepstack_features" in output_name:
                     buffers[output_name] = np.zeros(shape, dtype=np.float16)
                 else:
                     buffers[output_name] = np.zeros(shape, dtype=np.float32)
diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py
@@ -146,15 +146,7 @@ def __init__(
         )
 
         # Vision-specific initialization
-        self.is_qwen2_5_vl = (
-            hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl"
-        )
-        self.is_qwen3_vl_moe = (
-            hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen3_vl_moe"
-        )
-        self.is_qwen3_vl = (
-            hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen3_vl"
-        )
+        self.is_qwen_vl = getattr(qeff_model.model.config, "model_type", None) in {"qwen2_5_vl", "qwen3_vl_moe", "qwen3_vl"}
         self.qeff_model = qeff_model
         self.processor = processor
         self.tokenizer = tokenizer
@@ -262,37 +254,12 @@ def run_prefill_for_all_inputs(self, prompt_queue, generation_len):
             outputs, position_ids, generation_len = self.run_prefill(
                 next_prompt, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1)
             )
-            if self.is_qwen2_5_vl:
-                _ = self.update_decode_inputs_qwen2_5_vl(outputs, position_ids, generation_len, decode_batch_id)
-            elif self.is_qwen3_vl_moe:
-                _ = self.update_decode_inputs_qwen3_vl_moe(outputs, position_ids, generation_len, decode_batch_id)
-            elif self.is_qwen3_vl:
-                _ = self.update_decode_inputs_qwen3_vl_moe(outputs, position_ids, generation_len, decode_batch_id)
+            if self.is_qwen_vl:
+                _ = self.update_decode_inputs_qwen_vl(outputs, position_ids, generation_len, decode_batch_id)
             else:
                 _ = self.update_decode_input(outputs, position_ids, generation_len, decode_batch_id)
 
-    def update_decode_inputs_qwen2_5_vl(self, outputs, position_ids, generation_len, decode_batch_id=None):
-        """
-        Updates the decode input with the generated values.
-        Args:
-            outputs (dict): The outputs of the model.
-            position_ids (array): The position IDs.
-            generation_len (int): The generation length.
-            decode_batch_id (int, optional): The decode batch ID. If None, all values are updated. Defaults to None.
-
-        Returns:
-            next_token_id (array): The next token ID.
-        """
-        next_token_id = self._fetch_next_token_id(outputs)
-
-        # Store the generated values.
-        self.decode_input_ids[decode_batch_id or slice(None)] = next_token_id
-        self.decode_pos_ids[:, decode_batch_id] = position_ids.squeeze(1)
-        self.generated_ids[decode_batch_id or slice(None), 0] = next_token_id.squeeze(1)
-        self.generation_len[decode_batch_id or slice(None)] = generation_len
-        return next_token_id
-
-    def update_decode_inputs_qwen3_vl_moe(self, outputs, position_ids, generation_len, decode_batch_id=None):
+    def update_decode_inputs_qwen_vl(self, outputs, position_ids, generation_len, decode_batch_id=None):
         """
         Updates the decode input with the generated values.
         Args:
@@ -313,26 +280,6 @@ def update_decode_inputs_qwen3_vl_moe(self, outputs, position_ids, generation_le
         self.generation_len[decode_batch_id or slice(None)] = generation_len
         return next_token_id
 
-    def update_decode_inputs_qwen3_vl(self, outputs, position_ids, generation_len, decode_batch_id=None):
-        """
-        Updates the decode input with the generated values.
-        Args:
-            outputs (dict): The outputs of the model.
-            position_ids (array): The position IDs.
-            generation_len (int): The generation length.
-            decode_batch_id (int, optional): The decode batch ID. If None, all values are updated. Defaults to None.
-
-        Returns:
-            next_token_id (array): The next token ID.
-        """
-        next_token_id = self._fetch_next_token_id(outputs)
-
-        # Store the generated values.
-        self.decode_input_ids[decode_batch_id or slice(None)] = next_token_id
-        self.decode_pos_ids[:, decode_batch_id] = position_ids.squeeze(1)
-        self.generated_ids[decode_batch_id or slice(None), 0] = next_token_id.squeeze(1)
-        self.generation_len[decode_batch_id or slice(None)] = generation_len
-        return next_token_id
 
     def _execute_chunked_prefill(
         self,
@@ -632,11 +579,7 @@ def _generate_continuous_batching(self, vision_prompts, generation_len, stream,
         max_gen_length = self._ctx_len if not generation_len else max(self._ctx_len, generation_len)
 
         self.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length)
-        if self.is_qwen2_5_vl:
-            self.decode_pos_ids = np.zeros((4, execution_batch_size, 1), np.int64)
-        if self.is_qwen3_vl_moe:
-            self.decode_pos_ids = np.zeros((4, execution_batch_size, 1), np.int64)
-        if self.is_qwen3_vl:
+        if self.is_qwen_vl:
             self.decode_pos_ids = np.zeros((4, execution_batch_size, 1), np.int64)
         # Create prompt queue
         prompt_queue = deque(vision_prompts)
@@ -744,16 +687,8 @@ def run_prefill_for_all_inputs_with_cached_vision(self, prompt_queue, generation
                 generation_len_final = self._fetch_generation_len(generation_len, max_gen_len)
 
                 # Update decode inputs
-                if self.is_qwen2_5_vl:
-                    self.update_decode_inputs_qwen2_5_vl(
-                        outputs, position_ids_decode, generation_len_final, decode_batch_id
-                    )
-                elif self.is_qwen3_vl_moe:
-                    self.update_decode_inputs_qwen3_vl_moe(
-                        outputs, position_ids_decode, generation_len_final, decode_batch_id
-                    )
-                elif self.is_qwen3_vl:
-                    self.update_decode_inputs_qwen3_vl(
+                if self.is_qwen_vl:
+                    self.update_decode_inputs_qwen_vl(
                         outputs, position_ids_decode, generation_len_final, decode_batch_id
                     )
                 else:
diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
@@ -192,7 +192,6 @@ def update(
             A tuple containing the updated key and value states.
         """
         # Update the cache
-        # if not self.is_initialized:
 
         if self.keys is None:
             self.keys = key_states
@@ -336,17 +335,13 @@ def __init__(
                 layer_class_to_replicate=QEffDynamicLayer,
                 offloading=offloading,
                 offload_only_non_sliding=offload_only_non_sliding,
-                # args=args,
-                # kwargs=kwargs,
             )
         else:
             Cache.__init__(
                 self,
                 layers=layers,
                 offloading=offloading,
                 offload_only_non_sliding=offload_only_non_sliding,
-                # args=args,
-                # kwargs=kwargs,
             )
 
         if ddp_cache_data is not None:
@@ -434,18 +429,7 @@ def update3D(
         self.append_new_layers(layer_idx)
         return self.layers[layer_idx].update3D(key_states, value_states, cache_kwargs)
 
-    # def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-    #     """Returns the sequence length of the cached states. A layer index can be optionally passed."""
-    #     # TODO: deprecate this function in favor of `cache_position`
-    #     breakpoint()
-    #     is_empty_layer = (
-    #         len(self.key_cache) == 0  # no cache in any layer
-    #         or len(self.key_cache) <= layer_idx  # skipped `layer_idx` and hasn't run a layer with cache after it
-    #         or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
-    #     )
-    #     layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
-    #     return layer_seq_length
-
+
 
 class QEffEncoderDecoderCache(EncoderDecoderCache):
     """