inference_models/inference_models/models/qwen3vl (1 file changed, +17 -5 lines)

@@ -21,6 +21,22 @@
 )
 
 
+def _get_qwen3vl_attn_implementation(device: torch.device) -> str:
+    """Use flash_attention_2 if available, otherwise eager.
+
+    SDPA has dtype mismatch issues with some transformers versions.
+    """
+    if is_flash_attn_2_available() and device and "cuda" in str(device):
+        # Verify flash_attn can actually be imported (not just installed)
+        try:
+            import flash_attn  # noqa: F401
+
+            return "flash_attention_2"
+        except ImportError:
+            pass
+    return "eager"
+
+
 class Qwen3VLHF:
     default_dtype = torch.bfloat16
 
@@ -53,11 +69,7 @@ def from_pretrained(
 
         dtype = cls.default_dtype
 
-        attn_implementation = (
-            "flash_attention_2"
-            if (is_flash_attn_2_available() and device and "cuda" in str(device))
-            else "eager"
-        )
+        attn_implementation = _get_qwen3vl_attn_implementation(device)
 
         if os.path.exists(adapter_config_path):
             # Has adapter - load base model then apply LoRA
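
For context, a minimal sketch of how the new helper's return value could be passed through to model loading. This is not the PR's actual loading code: the Auto class, checkpoint id, and import path below are placeholders assumed for illustration; only the helper itself and the bfloat16 default come from the diff.

# Sketch only (not from the PR): placeholder Auto class, checkpoint id, and import path.
import torch
from transformers import AutoModelForImageTextToText

# Assumes the helper from this diff is importable from the qwen3vl module (hypothetical path).
from inference_models.models.qwen3vl import _get_qwen3vl_attn_implementation

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
attn_implementation = _get_qwen3vl_attn_implementation(device)  # "flash_attention_2" or "eager"

model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-placeholder",              # placeholder checkpoint id
    torch_dtype=torch.bfloat16,               # matches Qwen3VLHF.default_dtype
    attn_implementation=attn_implementation,  # sidesteps the SDPA dtype issues noted in the docstring
).to(device)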