Skip to content

Commit 07c5db6

Browse files
authored
[Feat] Support auto version selection for flash-attn (#4697)
* Support auto version selection for flash-attn * Use FD 2.2.1 * Update FD version and default params * Set FD version to 2.3.0rc0 * Refactor
1 parent 6f4a426 commit 07c5db6

File tree

4 files changed

+37
-6
lines changed

4 files changed

+37
-6
lines changed

paddlex/inference/genai/configs/paddleocr_vl_09b.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
def get_config(backend):
1717
if backend == "fastdeploy":
1818
return {
19-
"gpu-memory-utilization": 0.3,
19+
"gpu-memory-utilization": 0.7,
2020
"max-model-len": 16384,
21-
"max-num-batched-tokens": 131072,
21+
"max-num-batched-tokens": 16384,
2222
"max-num-seqs": 256,
23+
"workers": 2,
24+
"graph-optimization-config": '{"graph_opt_level":0, "use_cudagraph":true}',
2325
}
2426
elif backend == "vllm":
2527
return {

paddlex/paddlex_cli.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
is_dep_available,
3737
is_paddle2onnx_plugin_available,
3838
)
39-
from .utils.env import get_paddle_cuda_version
39+
from .utils.env import get_gpu_compute_capability, get_paddle_cuda_version
4040
from .utils.install import install_packages, uninstall_packages
4141
from .utils.interactive_get_pipeline import interactive_get_pipeline
4242
from .utils.pipeline_arguments import PIPELINE_ARGUMENTS
@@ -334,7 +334,7 @@ def _install_genai_deps(plugin_types):
334334

335335
if not paddle.device.is_compiled_with_cuda():
336336
sys.exit("Currently, only the GPU version of FastDeploy is supported.")
337-
cap = paddle.device.cuda.get_device_capability()
337+
cap = get_gpu_compute_capability()
338338
if cap in ((8, 0), (9, 0)):
339339
index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/"
340340
elif cap in ((8, 6), (8, 9)):
@@ -370,7 +370,15 @@ def _install_genai_deps(plugin_types):
370370
if "vllm" in plugin_type or "sglang" in plugin_type:
371371
try:
372372
install_packages(["wheel"], constraints="required")
373-
install_packages(["flash-attn == 2.8.2"], constraints="required")
373+
cap = get_gpu_compute_capability()
374+
if cap >= (12, 0):
375+
install_packages(
376+
["flash-attn == 2.8.3"], constraints="required"
377+
)
378+
else:
379+
install_packages(
380+
["flash-attn == 2.8.2"], constraints="required"
381+
)
374382
except Exception:
375383
logging.error("Installation failed", exc_info=True)
376384
sys.exit(1)

paddlex/utils/deps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def get_genai_fastdeploy_spec(device_type):
306306
if device_type not in SUPPORTED_DEVICE_TYPES:
307307
raise ValueError(f"Unsupported device type: {device_type}")
308308
if device_type == "gpu":
309-
return "fastdeploy-gpu == 2.0.3"
309+
return "fastdeploy-gpu == 2.3.0rc0"
310310
else:
311311
raise AssertionError
312312

paddlex/utils/env.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from .deps import is_dep_available, require_deps
16+
1517

1618
def get_device_type():
1719
import paddle
@@ -56,3 +58,22 @@ def get_paddle_cudnn_version():
5658

5759

5860
# Should we also support getting the runtime versions of CUDA and cuDNN?
61+
62+
63+
def get_gpu_compute_capability():
    """Return the CUDA compute capability of the current GPU.

    The result is a ``(major, minor)`` tuple (e.g. ``(8, 0)``), or ``None``
    when no capability can be determined.

    Paddle is the preferred probe when it is installed; otherwise PyTorch
    is required and used as the fallback.
    """
    capability = None

    if is_dep_available("paddlepaddle"):
        import paddle.device

        # Only CUDA builds of Paddle expose a device capability.
        # NOTE(review): a CPU-only Paddle install returns None here even if
        # PyTorch with CUDA is present — confirm this is intentional.
        if paddle.device.is_compiled_with_cuda():
            capability = paddle.device.cuda.get_device_capability()
    else:
        # If Paddle is unavailable, retrieve the GPU compute capability
        # from PyTorch instead.
        require_deps("torch")
        import torch.cuda

        if torch.cuda.is_available():
            capability = torch.cuda.get_device_capability()

    return capability

0 commit comments

Comments
 (0)