
Commit 6685846

fix for cpu-only config server
1 parent c843243 commit 6685846
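
What changed: several Triton kernel modules probed the GPU at module import time via torch.cuda.get_device_name(0) and torch.cuda.get_device_capability(), both of which raise a RuntimeError on a host with no CUDA device, so a CPU-only config server failed at import. The commit routes those probes through cached helpers in lightllm/utils/device_utils.py that fall back to harmless defaults when CUDA is unavailable. A minimal sketch of the pattern (a standalone illustration, not the committed code verbatim):

    import torch
    from functools import lru_cache

    # Before: evaluated at module import, raises on CPU-only hosts.
    # TESLA = "Tesla" in torch.cuda.get_device_name(0)

    @lru_cache(maxsize=None)
    def get_cuda_device_name():
        # Fall back to "" so device-name checks are simply False without a GPU.
        if not torch.cuda.is_available():
            return ""
        return torch.cuda.get_device_name(0)

    # After: degrades gracefully when no CUDA device is present.
    TESLA = "Tesla" in get_cuda_device_name()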

File tree

9 files changed: +42, -15 lines

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py

Lines changed: 3 additions & 2 deletions

@@ -4,9 +4,10 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
+from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+TESLA = "Tesla" in get_cuda_device_name()
+CUDA_CAPABILITY = get_device_capability()


 @triton.jit

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_fp8.py

Lines changed: 3 additions & 2 deletions

@@ -4,9 +4,10 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
+from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+TESLA = "Tesla" in get_cuda_device_name()
+CUDA_CAPABILITY = get_device_capability()


 @triton.jit

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py

Lines changed: 3 additions & 2 deletions

@@ -4,9 +4,10 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
+from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+TESLA = "Tesla" in get_cuda_device_name()
+CUDA_CAPABILITY = get_device_capability()


 @triton.jit

lightllm/models/deepseek2/triton_kernel/sample_kv.py

Lines changed: 4 additions & 2 deletions

@@ -3,8 +3,10 @@
 import triton
 import triton.language as tl

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability
+
+TESLA = "Tesla" in get_cuda_device_name()
+CUDA_CAPABILITY = get_device_capability()


 @triton.jit

lightllm/models/llama/triton_kernel/context_flashattention_nopad.py

Lines changed: 3 additions & 1 deletion

@@ -7,7 +7,9 @@
 import math
 import torch.nn.functional as F

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
+from lightllm.utils.device_utils import get_cuda_device_name
+
+TESLA = "Tesla" in get_cuda_device_name()


 @triton.jit

lightllm/models/phi3/triton_kernel/context_flashattention_nopad.py

Lines changed: 3 additions & 1 deletion

@@ -5,7 +5,9 @@
 import math
 import torch.nn.functional as F

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
+from lightllm.utils.device_utils import get_cuda_device_name
+
+TESLA = "Tesla" in get_cuda_device_name()


 @triton.jit

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 6 additions & 5 deletions

@@ -5,12 +5,13 @@
 import math
 import torch.nn.functional as F

-TESLA = "Tesla" in torch.cuda.get_device_name(0)
+from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability
+
 HOPPER = (
-    "H100" in torch.cuda.get_device_name(0)
-    or "H200" in torch.cuda.get_device_name(0)
-    or "H800" in torch.cuda.get_device_name(0)
-    or "Hopper" in torch.cuda.get_device_name(0)
+    "H100" in get_cuda_device_name()
+    or "H200" in get_cuda_device_name()
+    or "H800" in get_cuda_device_name()
+    or "Hopper" in get_cuda_device_name()
 )

lightllm/utils/device_utils.py

Lines changed: 15 additions & 0 deletions

@@ -1,5 +1,6 @@
 import os
 import time
+import torch
 import shutil
 import subprocess
 from functools import lru_cache

@@ -8,6 +9,20 @@
 logger = init_logger(__name__)


+@lru_cache(maxsize=None)
+def get_cuda_device_name():
+    if not torch.cuda.is_available():
+        return ""
+    return torch.cuda.get_device_name(0)
+
+
+@lru_cache(maxsize=None)
+def get_device_capability():
+    if not torch.cuda.is_available():
+        return (-1, -1)
+    return torch.cuda.get_device_capability()
+
+
 @lru_cache(maxsize=None)
 def get_device_sm_count():
     import triton
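
With these fallbacks, CPU-only behavior follows from the sentinel values: the empty string makes every substring check on the device name False, and (-1, -1) fails any minimum-capability comparison. Expected behavior on a machine without a CUDA device (inferred from the fallbacks above):

    from lightllm.utils.device_utils import get_cuda_device_name, get_device_capability

    print(get_cuda_device_name())   # ""        -> "Tesla" in "" is False
    print(get_device_capability())  # (-1, -1)  -> fails checks like capability[0] >= 8

Both helpers are wrapped in @lru_cache(maxsize=None), so the underlying torch.cuda query runs at most once per process.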

lightllm/utils/envs_utils.py

Lines changed: 2 additions & 0 deletions

@@ -24,6 +24,8 @@ def get_unique_server_name():


 def set_cuda_arch(args):
+    if not torch.cuda.is_available():
+        return
     if args.enable_flashinfer_prefill or args.enable_flashinfer_decode:
         capability = torch.cuda.get_device_capability()
         arch = f"{capability[0]}.{capability[1]}"
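
The early return turns set_cuda_arch into a no-op when CUDA is unavailable, so a CPU-only config server never reaches the torch.cuda.get_device_capability() call below it; the diff adds no import here, which suggests torch was already imported in this module.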
