Skip to content

Commit 07c5db6

Browse files
authored
[Feat] Support auto version selection for flash-attn (#4697)
* Support auto version selection for flash-attn * Use FD 2.2.1 * Update FD version and default params * Set FD version to 2.3.0rc0 * Refactor
1 parent 6f4a426 commit 07c5db6

File tree

4 files changed

+37
-6
lines changed

4 files changed

+37
-6
lines changed

paddlex/inference/genai/configs/paddleocr_vl_09b.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
def get_config(backend):
1717
if backend == "fastdeploy":
1818
return {
19-
"gpu-memory-utilization": 0.3,
19+
"gpu-memory-utilization": 0.7,
2020
"max-model-len": 16384,
21-
"max-num-batched-tokens": 131072,
21+
"max-num-batched-tokens": 16384,
2222
"max-num-seqs": 256,
23+
"workers": 2,
24+
"graph-optimization-config": '{"graph_opt_level":0, "use_cudagraph":true}',
2325
}
2426
elif backend == "vllm":
2527
return {

paddlex/paddlex_cli.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
is_dep_available,
3737
is_paddle2onnx_plugin_available,
3838
)
39-
from .utils.env import get_paddle_cuda_version
39+
from .utils.env import get_gpu_compute_capability, get_paddle_cuda_version
4040
from .utils.install import install_packages, uninstall_packages
4141
from .utils.interactive_get_pipeline import interactive_get_pipeline
4242
from .utils.pipeline_arguments import PIPELINE_ARGUMENTS
@@ -334,7 +334,7 @@ def _install_genai_deps(plugin_types):
334334

335335
if not paddle.device.is_compiled_with_cuda():
336336
sys.exit("Currently, only the GPU version of FastDeploy is supported.")
337-
cap = paddle.device.cuda.get_device_capability()
337+
cap = get_gpu_compute_capability()
338338
if cap in ((8, 0), (9, 0)):
339339
index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/"
340340
elif cap in ((8, 6), (8, 9)):
@@ -370,7 +370,15 @@ def _install_genai_deps(plugin_types):
370370
if "vllm" in plugin_type or "sglang" in plugin_type:
371371
try:
372372
install_packages(["wheel"], constraints="required")
373-
install_packages(["flash-attn == 2.8.2"], constraints="required")
373+
cap = get_gpu_compute_capability()
374+
if cap >= (12, 0):
375+
install_packages(
376+
["flash-attn == 2.8.3"], constraints="required"
377+
)
378+
else:
379+
install_packages(
380+
["flash-attn == 2.8.2"], constraints="required"
381+
)
374382
except Exception:
375383
logging.error("Installation failed", exc_info=True)
376384
sys.exit(1)

paddlex/utils/deps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def get_genai_fastdeploy_spec(device_type):
306306
if device_type not in SUPPORTED_DEVICE_TYPES:
307307
raise ValueError(f"Unsupported device type: {device_type}")
308308
if device_type == "gpu":
309-
return "fastdeploy-gpu == 2.0.3"
309+
return "fastdeploy-gpu == 2.3.0rc0"
310310
else:
311311
raise AssertionError
312312

paddlex/utils/env.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from .deps import is_dep_available, require_deps
16+
1517

1618
def get_device_type():
1719
import paddle
@@ -56,3 +58,22 @@ def get_paddle_cudnn_version():
5658

5759

5860
# Should we also support getting the runtime versions of CUDA and cuDNN?
61+
62+
63+
def get_gpu_compute_capability():
    """Return the CUDA compute capability of the current GPU.

    The result is a ``(major, minor)`` tuple (e.g. ``(8, 0)``), or ``None``
    when no capability can be determined.

    Paddle is the preferred probe when it is installed; otherwise PyTorch
    is required and used as the fallback.
    """
    capability = None

    if is_dep_available("paddlepaddle"):
        import paddle.device

        # Only CUDA builds of Paddle expose a device capability.
        # NOTE(review): a CPU-only Paddle install returns None here even if
        # PyTorch with CUDA is present — confirm this is intentional.
        if paddle.device.is_compiled_with_cuda():
            capability = paddle.device.cuda.get_device_capability()
    else:
        # If Paddle is unavailable, retrieve the GPU compute capability
        # from PyTorch instead.
        require_deps("torch")
        import torch.cuda

        if torch.cuda.is_available():
            capability = torch.cuda.get_device_capability()

    return capability

0 commit comments

Comments
 (0)