Skip to content

Commit bba6252

Browse files
committed
Allow user to specify vision_size for decoder specialization
Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
1 parent c8cf322 commit bba6252

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,7 +1047,12 @@ def get_specializations(
10471047
max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
10481048

10491049
vision = []
1050-
min_vision_size = ctx_len
1050+
min_vision_size = None
1051+
user_vision_size = compiler_options.pop("vision_size", None)
1052+
if user_vision_size:
1053+
assert user_vision_size < ctx_len, "vision_size must be less than ctx_len"
1054+
else:
1055+
min_vision_size = ctx_len
10511056
for h, w in zip(height, width):
10521057
resized_height, resized_width = smart_resize(
10531058
height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
@@ -1057,7 +1062,8 @@ def get_specializations(
10571062
grid_width = patch_size * patch_size * temporal_patch_size * channel
10581063
vision_size = grid_height // 4
10591064
grid_height = grid_height * batch_size
1060-
min_vision_size = min(min_vision_size, vision_size * num_frames)
1065+
if not user_vision_size:
1066+
min_vision_size = min(min_vision_size, vision_size * num_frames)
10611067

10621068
vision.append(
10631069
{
@@ -1078,7 +1084,7 @@ def get_specializations(
10781084
"batch_size": 1 if continuous_batching else batch_size,
10791085
"seq_len": prefill_seq_len,
10801086
"ctx_len": ctx_len,
1081-
"vision_size": min_vision_size,
1087+
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
10821088
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
10831089
"vision_batch_size": batch_size,
10841090
}
@@ -1097,7 +1103,7 @@ def get_specializations(
10971103
"batch_size": full_batch_size if continuous_batching else batch_size,
10981104
"seq_len": "1",
10991105
"ctx_len": ctx_len,
1100-
"vision_size": min_vision_size,
1106+
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
11011107
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
11021108
"vision_batch_size": batch_size,
11031109
}
@@ -1113,7 +1119,7 @@ def get_specializations(
11131119
"batch_size": 1 if continuous_batching else batch_size,
11141120
"seq_len": prefill_seq_len,
11151121
"ctx_len": ctx_len,
1116-
"vision_size": min_vision_size,
1122+
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
11171123
"vision_batch_size": batch_size,
11181124
}
11191125

@@ -1128,7 +1134,7 @@ def get_specializations(
11281134
"batch_size": full_batch_size if continuous_batching else batch_size,
11291135
"seq_len": 1,
11301136
"ctx_len": ctx_len,
1131-
"vision_size": min_vision_size,
1137+
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
11321138
"vision_batch_size": batch_size,
11331139
}
11341140

examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
8181
heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
8282
num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
83+
user_vision_size = 9216
8384

8485
qeff_model.compile(
8586
batch_size=batch_size,
@@ -94,6 +95,7 @@
9495
"min_pixels": 4 * 28 * 28,
9596
"max_pixels": 16384 * 28 * 28,
9697
},
98+
vision_size=user_vision_size,
9799
mxfp6_matmul=True,
98100
mxint8_kv_cache=True,
99101
aic_enable_depth_first=True,

0 commit comments

Comments (0)