
Commit cc45396

fix: enable cuda_use_flash_attention2 for PictureDescriptionVlmModel (#1496)

Signed-off-by: Zach Cox <zach.s.cox@gmail.com>
1 parent 976e92e commit cc45396

File tree

1 file changed (+4, -1)

docling/models/picture_description_vlm_model.py

Lines changed: 4 additions & 1 deletion
@@ -57,7 +57,10 @@ def __init__(
             artifacts_path,
             torch_dtype=torch.bfloat16,
             _attn_implementation=(
-                "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                "flash_attention_2"
+                if self.device.startswith("cuda")
+                and accelerator_options.cuda_use_flash_attention2
+                else "eager"
             ),
         ).to(self.device)
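
After this change, the picture-description VLM only requests the flash_attention_2 implementation when the model is placed on a CUDA device and the user has opted in through the accelerator options; otherwise it falls back to eager attention. Below is a minimal sketch of how that opt-in could look from the pipeline side, assuming the docling import paths and the smolvlm_picture_description preset documented around this release; the exact names and the input file are illustrative assumptions, not part of this commit.

# A minimal sketch, assuming docling's documented pipeline API around this release;
# the import paths, the smolvlm_picture_description preset, and "report.pdf" are
# illustrative assumptions, not taken from this commit.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = smolvlm_picture_description
# Opt in to FlashAttention-2; it is only applied when the model actually runs on CUDA.
pipeline_options.accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.CUDA,
    cuda_use_flash_attention2=True,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("report.pdf")  # hypothetical input file

With cuda_use_flash_attention2 left at its default of False, the model keeps using eager attention even on CUDA, which is the behavior this commit restores control over.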

0 commit comments