Commit c99e0ed
Add nvidia-ml-py to set USE_FLASH_ATTENTION based on compute cap
Parent: 99d353c

2 files changed: 12 additions, 2 deletions

tests/requirements.txt

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,5 @@
-pytest==8.3.2
-GPUtil==1.4.0
 docker==7.1.0
+GPUtil==1.4.0
+pytest==8.3.2
+nvidia-ml-py==12.560.30
 transformers==4.44.2
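
For context, the nvidia-ml-py package ships the pynvml module imported in the test below; it is NVIDIA's Python binding for NVML. A minimal sanity check that the binding can reach the driver might look like the following sketch (illustrative only, not part of the commit):

    import pynvml

    # Initialize NVML, report the driver version and the number of
    # visible GPUs, then release the library handle.
    pynvml.nvmlInit()
    print("Driver version:", pynvml.nvmlSystemGetDriverVersion())
    print("GPU count:", pynvml.nvmlDeviceGetCount())
    pynvml.nvmlShutdown()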

tests/tgi/test_tgi.py

Lines changed: 9 additions & 0 deletions
@@ -7,6 +7,7 @@
 import pytest
 import requests
 
+import pynvml
 from docker.types.containers import DeviceRequest
 from transformers import AutoTokenizer
 
@@ -43,6 +44,14 @@ def test_text_generation_inference(
 
     client = docker.from_env()
 
+    # If the GPU compute capability is lower than 8.0 (Ampere), then set `USE_FLASH_ATTENTION=false`
+    pynvml.nvmlInit()
+    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+    compute_capability = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+    if compute_capability[0] < 8:
+        text_generation_launcher_kwargs["USE_FLASH_ATTENTION"] = "false"
+    pynvml.nvmlShutdown()
+
     logging.info(
         f"Starting container for {text_generation_launcher_kwargs.get('MODEL_ID', None)}..."
     )
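
Taken out of the test, the new block reads GPU 0's (major, minor) CUDA compute capability via NVML and disables flash attention when the major version is below 8, i.e. on pre-Ampere parts such as a T4 (7.5) or V100 (7.0). A standalone sketch of the same check follows; the helper name and the try/finally guard (so NVML is shut down even if the query raises) are additions for illustration, not part of the commit:

    import pynvml

    def flash_attention_supported(gpu_index: int = 0) -> bool:
        """True if the GPU's compute capability is 8.0 (Ampere) or newer."""
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            return major >= 8
        finally:
            pynvml.nvmlShutdown()

    # On a T4 (compute capability 7.5) this prints False, so the test
    # would launch the container with USE_FLASH_ATTENTION=false.
    print(flash_attention_supported())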
