File tree Expand file tree Collapse file tree 2 files changed +12
-2
lines changed Expand file tree Collapse file tree 2 files changed +12
-2
lines changed Original file line number Diff line number Diff line change 1- pytest == 8.3.2
2- GPUtil == 1.4.0
31docker == 7.1.0
2+ GPUtil == 1.4.0
3+ pytest == 8.3.2
4+ nvidia-ml-py == 12.560.30
45transformers == 4.44.2
Original file line number Diff line number Diff line change 77import pytest
88import requests
99
10+ import pynvml
1011from docker .types .containers import DeviceRequest
1112from transformers import AutoTokenizer
1213
@@ -43,6 +44,14 @@ def test_text_generation_inference(
4344
4445 client = docker .from_env ()
4546
47+ # If the CUDA compute capability of GPU 0 is lower than 8.0 (i.e. older than Ampere), set `USE_FLASH_ATTENTION=false`
48+ pynvml .nvmlInit ()
49+ handle = pynvml .nvmlDeviceGetHandleByIndex (0 )
50+ compute_capability = pynvml .nvmlDeviceGetCudaComputeCapability (handle )
51+ if compute_capability [0 ] < 8 :
52+ text_generation_launcher_kwargs ["USE_FLASH_ATTENTION" ] = "false"
53+ pynvml .nvmlShutdown ()
54+
4655 logging .info (
4756 f"Starting container for { text_generation_launcher_kwargs .get ('MODEL_ID' , None )} ..."
4857 )
You can’t perform that action at this time.
0 commit comments