|
1 | 1 | import os |
2 | | -from functools import lru_cache |
| 2 | +import time |
| 3 | +import shutil |
3 | 4 | import subprocess |
| 5 | +from functools import lru_cache |
| 6 | +from lightllm.utils.log_utils import init_logger |
| 7 | + |
| 8 | +logger = init_logger(__name__) |
4 | 9 |
|
5 | 10 |
|
6 | 11 | @lru_cache(maxsize=None) |
@@ -99,3 +104,110 @@ def has_nvlink(): |
99 | 104 | except subprocess.CalledProcessError: |
100 | 105 | # If there's an error (e.g., nvidia-smi is not installed or another issue), assume no NVLink |
101 | 106 | return False |
| 107 | + |
| 108 | + |
def is_mps_running(verbose=False):
    """Report whether an ``nvidia-cuda-mps-control`` process is listed by ``ps -ef``.

    Args:
        verbose: When true, log the outcome of the check, including the
            matching ``ps`` lines when a process is found.

    Returns:
        bool: True when at least one line of ``ps -ef`` matches
        ``nvidia-cuda-mps-control``, False otherwise.
    """
    # The "[n]" character class keeps the grep command line itself from matching.
    result = subprocess.run(
        "ps -ef | grep '[n]vidia-cuda-mps-control'",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # grep exits with status 0 only when it printed at least one matching line.
    running = result.returncode == 0
    if verbose:
        if running:
            logger.info(f"MPS control daemon detected:\n{result.stdout.strip()}")
        else:
            logger.info("MPS control daemon not detected.")
    return running
| 118 | + |
| 119 | + |
def stop_mps():
    """Ask a running MPS control daemon to quit.

    Does nothing (beyond logging) when ``is_mps_running()`` reports no daemon.
    Otherwise sends ``quit`` on the standard input of ``nvidia-cuda-mps-control``
    and logs whether the command exited successfully.

    Returns:
        None. The outcome is reported through the module logger only.
    """
    if not is_mps_running():
        logger.info("MPS is not running, no need to stop.")
        return

    # Announce the attempt before running the command so the log order matches reality.
    logger.info("Stopping MPS...")
    try:
        # Equivalent to `echo quit | nvidia-cuda-mps-control`, without going through a shell.
        result = subprocess.run(["nvidia-cuda-mps-control"], input="quit\n", text=True)
    except OSError as e:
        # Raised when the executable cannot be launched (e.g. not on PATH).
        logger.warning(f"Failed to stop MPS: {e}")
        return

    if result.returncode == 0:
        logger.info("MPS stopped successfully.")
    else:
        logger.warning("Failed to stop MPS.")
| 130 | + |
| 131 | + |
def enable_mps():
    """Start the MPS control daemon unless one is already running.

    Runs ``nvidia-cuda-mps-control -d``, waits, and then re-checks with
    ``is_mps_running()``; every outcome is reported through the module logger.

    Returns:
        None.
    """
    if is_mps_running():
        logger.info("MPS is already running, no need to start.")
        return

    ret = os.system("nvidia-cuda-mps-control -d")
    if ret != 0:
        # The launch command itself failed: report right away instead of waiting first.
        logger.warning("Failed to start MPS.")
        return

    # Fixed grace period before probing; presumably gives the daemon time to come up.
    time.sleep(10)
    if is_mps_running():
        logger.info("MPS started successfully.")
    else:
        # Previously this case returned silently, hiding a daemon that exited after launch.
        logger.warning("MPS start command succeeded, but the MPS daemon is not running.")
| 146 | + |
| 147 | + |
def get_gpu_compute_mode(gpu_index=0):
    """Query the compute mode of one GPU through ``nvidia-smi``.

    Args:
        gpu_index: Index passed to ``nvidia-smi -i``.

    Returns:
        The stripped text printed by
        ``nvidia-smi --query-gpu=compute_mode --format=csv,noheader``, or
        None when ``nvidia-smi`` is not on PATH, exits with a non-zero
        status, or any exception is raised along the way.
    """
    try:
        smi_path = shutil.which("nvidia-smi")
        if not smi_path:
            logger.warning("nvidia-smi not found in PATH.")
            return None

        query = subprocess.run(
            [
                "nvidia-smi",
                "-i",
                str(gpu_index),
                "--query-gpu=compute_mode",
                "--format=csv,noheader",
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        if query.returncode == 0:
            return query.stdout.strip()

        logger.warning(f"Failed to query compute mode: {query.stderr.strip()}")
        return None

    except Exception as exc:
        # Best-effort probe: any failure is logged and reported as "unknown".
        logger.warning(f"Exception occurred while checking GPU compute mode: {exc}")
        return None
| 167 | + |
| 168 | + |
def _set_gpu_compute_mode(gpu_index, mode):
    """Run ``nvidia-smi -i <gpu_index> -c <mode>`` and report whether it succeeded.

    Args:
        gpu_index: Index passed to ``nvidia-smi -i``.
        mode: Compute-mode name passed to ``nvidia-smi -c``.

    Returns:
        bool: True when the command exits with status 0; False when it exits
        with a non-zero status or cannot be launched at all.
    """
    logger.info(f"Setting GPU {gpu_index} to {mode} mode...")
    try:
        result = subprocess.run(
            ["nvidia-smi", "-i", str(gpu_index), "-c", mode],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as e:
        # Raised when nvidia-smi cannot be executed (e.g. not on PATH); report
        # failure instead of propagating, matching the other helpers' style.
        logger.warning(f"Failed to set {mode} mode: {e}")
        return False

    if result.returncode == 0:
        logger.info(f"GPU {gpu_index} set to {mode} mode.")
        return True

    logger.warning(f"Failed to set {mode} mode: {result.stderr.strip()}")
    return False


def set_gpu_exclusive_mode(gpu_index=0):
    """Switch a GPU to EXCLUSIVE_PROCESS compute mode via ``nvidia-smi``.

    Args:
        gpu_index: Index of the GPU to reconfigure.

    Returns:
        bool: True on success, False otherwise (the reason is logged).
    """
    return _set_gpu_compute_mode(gpu_index, "EXCLUSIVE_PROCESS")


def set_gpu_default_mode(gpu_index=0):
    """Switch a GPU to DEFAULT compute mode via ``nvidia-smi``.

    Args:
        gpu_index: Index of the GPU to reconfigure.

    Returns:
        bool: True on success, False otherwise (the reason is logged).
    """
    return _set_gpu_compute_mode(gpu_index, "DEFAULT")
| 196 | + |
| 197 | + |
def set_sm_limit(percent: int, gpu_index=0):
    """Export ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE`` for this process.

    The variable is written to ``os.environ`` only when ``percent`` lies in
    the range 1..100 and ``get_gpu_compute_mode(gpu_index)`` returns
    ``"Exclusive_Process"``.

    Args:
        percent: Value to store in the environment variable (1 to 100).
        gpu_index: GPU whose compute mode is checked first.

    Returns:
        bool: True when the variable was set, False when validation or the
        compute-mode check failed.
    """
    in_range = 1 <= percent <= 100
    if not in_range:
        logger.error("SM usage percentage must be between 1 and 100.")
        return False

    current_mode = get_gpu_compute_mode(gpu_index)
    if current_mode == "Exclusive_Process":
        os.environ["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = str(percent)
        logger.info(f"Set CUDA_MPS_ACTIVE_THREAD_PERCENTAGE to {percent}% for GPU {gpu_index}.")
        return True

    logger.warning(
        f"Cannot set SM limit. GPU {gpu_index} is in '{current_mode}' mode, not 'Exclusive_Process'."
    )
    return False
0 commit comments