|
1 | 1 | import asyncio |
2 | 2 | import datetime |
| 3 | +import fcntl |
3 | 4 | import os |
4 | 5 | import random |
5 | 6 | import shlex |
@@ -64,6 +65,7 @@ def __init__( |
64 | 65 | self._stop_time = datetime.datetime.now() + datetime.timedelta(minutes=self._config.auto_clear_time) |
65 | 66 | self._check_stop_task = None |
66 | 67 | self._container_name = None |
| 68 | + self._resolved_gpu_spec: str | None = None |
67 | 69 | self._service_status = PersistedServiceStatus() |
68 | 70 | if self._config.container_name: |
69 | 71 | self.set_container_name(self._config.container_name) |
@@ -169,6 +171,96 @@ def _build_runtime_args(self) -> list[str]: |
169 | 171 | ] |
170 | 172 | return ["--privileged"] |
171 | 173 |
|
def _detect_gpu_count(self) -> int:
    """Count the GPUs reported by ``nvidia-smi --list-gpus``.

    Detection is best-effort: it never raises for an unusable ``nvidia-smi``.

    Returns:
        The number of non-blank lines printed by ``nvidia-smi --list-gpus``,
        or ``0`` when the command is missing, exits non-zero, times out, or
        produces output that cannot be decoded.
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--list-gpus"],
            text=True,
            stderr=subprocess.DEVNULL,
            # Bound the wait: without a timeout a hung nvidia-smi would block
            # the caller forever.
            timeout=10,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing or not executable.
        # SubprocessError: non-zero exit status or timeout.
        # ValueError: output could not be decoded as text.
        return 0
    return sum(1 for line in out.splitlines() if line.strip())
| 185 | + |
| 186 | + def _resolve_round_robin_gpu_spec(self, gpu_count_per_sandbox: int) -> str | None: |
| 187 | + """Allocate device ids in round-robin across host GPUs.""" |
| 188 | + total_gpus = self._detect_gpu_count() |
| 189 | + if total_gpus <= 0: |
| 190 | + logger.warning("GPU round-robin requested but no GPUs detected on host") |
| 191 | + return None |
| 192 | + |
| 193 | + per_sandbox = max(1, min(int(gpu_count_per_sandbox), total_gpus)) |
| 194 | + counter_path = os.getenv("ROCK_GPU_COUNTER_PATH", "/tmp/rock_gpu_rr_counter") |
| 195 | + os.makedirs(os.path.dirname(counter_path) or ".", exist_ok=True) |
| 196 | + |
| 197 | + with open(counter_path, "a+", encoding="utf-8") as fp: |
| 198 | + fcntl.flock(fp.fileno(), fcntl.LOCK_EX) |
| 199 | + try: |
| 200 | + fp.seek(0) |
| 201 | + raw = fp.read().strip() |
| 202 | + counter = int(raw) if raw.isdigit() else 0 |
| 203 | + start = counter % total_gpus |
| 204 | + next_counter = counter + per_sandbox |
| 205 | + fp.seek(0) |
| 206 | + fp.truncate() |
| 207 | + fp.write(str(next_counter)) |
| 208 | + fp.flush() |
| 209 | + finally: |
| 210 | + fcntl.flock(fp.fileno(), fcntl.LOCK_UN) |
| 211 | + |
| 212 | + device_ids = [(start + i) % total_gpus for i in range(per_sandbox)] |
| 213 | + return "device=" + ",".join(str(i) for i in device_ids) |
| 214 | + |
def _build_gpu_args(self) -> list[str]:
    """Build the ``--gpus`` docker arguments from runtime config and ROCK_* env vars.

    Resolution rules, as implemented below:

    * If the user-supplied docker args already contain ``--gpus``, nothing is added.
    * Pass-through is enabled by ``runtime_config.enable_gpu_passthrough`` or
      by ``ROCK_ENABLE_GPU_PASSTHROUGH`` in {"1", "true", "yes", "on"}.
    * The allocation mode comes from ``runtime_config.gpu_allocation_mode``,
      then ``ROCK_GPU_ALLOCATION_MODE``, then ``"fixed"``.
    * In ``round_robin`` mode the per-sandbox count comes from
      ``ROCK_GPU_COUNT_PER_SANDBOX`` when it is a positive integer, otherwise
      from ``runtime_config.gpu_count_per_sandbox`` (default 1).
    * In any other mode the device request comes from
      ``runtime_config.gpu_device_request``, then ``ROCK_GPU_DEVICE_REQUEST``,
      then ``"all"``.

    Side effects:
        Sets ``self._resolved_gpu_spec`` to the unquoted spec that was chosen,
        or to ``None`` when no ``--gpus`` argument is produced.

    Returns:
        ``["--gpus", <spec>]`` or an empty list.
    """
    self._resolved_gpu_spec = None
    user_docker_args = self._config.docker_args or []
    if any(arg == "--gpus" or arg.startswith("--gpus=") for arg in user_docker_args):
        return []

    runtime_config = self._config.runtime_config
    runtime_enabled = bool(getattr(runtime_config, "enable_gpu_passthrough", False))
    env_enabled = os.getenv("ROCK_ENABLE_GPU_PASSTHROUGH", "").strip().lower() in {"1", "true", "yes", "on"}
    if not (runtime_enabled or env_enabled):
        return []

    # `or ""` keeps an attribute explicitly set to None from being
    # stringified to "None", which would shadow the env-var fallback.
    runtime_mode = str(getattr(runtime_config, "gpu_allocation_mode", "") or "").strip().lower()
    mode = runtime_mode or os.getenv("ROCK_GPU_ALLOCATION_MODE", "fixed").strip().lower() or "fixed"

    gpu_spec: str | None
    if mode == "round_robin":
        runtime_count = int(getattr(runtime_config, "gpu_count_per_sandbox", 1) or 1)
        env_count_raw = os.getenv("ROCK_GPU_COUNT_PER_SANDBOX", "").strip()
        env_count = int(env_count_raw) if env_count_raw.isdigit() else None
        # The env var overrides the config here because the config value
        # always resolves to a usable default.
        per_sandbox = env_count or runtime_count
        gpu_spec = self._resolve_round_robin_gpu_spec(per_sandbox)
        if not gpu_spec:
            return []
        logger.info(f"GPU pass-through round-robin enabled: --gpus {gpu_spec}")
    else:
        runtime_gpu_spec = str(getattr(runtime_config, "gpu_device_request", "") or "").strip()
        gpu_spec = runtime_gpu_spec or (os.getenv("ROCK_GPU_DEVICE_REQUEST", "all").strip() or "all")
        logger.info(f"GPU pass-through fixed mode enabled: --gpus {gpu_spec}")

    self._resolved_gpu_spec = gpu_spec

    # Docker parses the --gpus value as CSV, so a bare "device=0,1" is split
    # at the comma and rejected. A multi-device list must carry literal double
    # quotes. Specs with further key=value fields are left untouched because
    # their commas are genuine field separators.
    cli_spec = gpu_spec
    if gpu_spec.startswith("device=") and "," in gpu_spec and gpu_spec.count("=") == 1:
        cli_spec = f'"{gpu_spec}"'
    return ["--gpus", cli_spec]
| 246 | + |
def _build_gpu_env_args(self) -> list[str]:
    """Build ``-e`` docker arguments exposing the resolved GPU selection.

    Reads ``self._resolved_gpu_spec`` and produces environment variables only
    for an explicit ``device=<id>[,<id>...]`` selection.

    Returns:
        ``["-e", "NVIDIA_VISIBLE_DEVICES=...", "-e", "CUDA_VISIBLE_DEVICES=..."]``
        for an explicit device list, otherwise an empty list (no spec, ``all``,
        a non-``device=`` spec, or a device list that cannot be parsed).
    """
    spec = self._resolved_gpu_spec
    if not spec:
        return []
    # Tolerate a spec that already carries the literal quotes docker requires
    # for multi-device lists.
    spec = spec.strip().strip("\"'")
    if spec == "all" or not spec.startswith("device="):
        return []

    device_ids = [part.strip() for part in spec.split("=", 1)[1].split(",") if part.strip()]
    # Bail out on "device=all" and on specs carrying extra key=value fields;
    # neither is a plain id list that can be exported as-is.
    if not device_ids or device_ids == ["all"] or any("=" in part for part in device_ids):
        return []

    nvidia_devices = ",".join(device_ids)
    if all(part.isdigit() for part in device_ids):
        # Numeric ids are host indices. Inside the container the selected GPUs
        # are enumerated from 0, so host ordinals would point CUDA at devices
        # that do not exist there.
        cuda_devices = ",".join(str(i) for i in range(len(device_ids)))
    else:
        # Non-numeric ids (e.g. GPU UUIDs) are passed through unchanged.
        cuda_devices = nvidia_devices

    return [
        "-e",
        f"NVIDIA_VISIBLE_DEVICES={nvidia_devices}",
        "-e",
        f"CUDA_VISIBLE_DEVICES={cuda_devices}",
    ]
| 263 | + |
172 | 264 | def _get_rocklet_start_cmd(self) -> list[str]: |
173 | 265 | cmd = self._runtime_env.get_rocklet_start_cmd() |
174 | 266 |
|
@@ -342,15 +434,19 @@ async def start(self): |
342 | 434 |
|
343 | 435 | time.sleep(random.randint(0, 5)) |
344 | 436 | runtime_args = self._build_runtime_args() |
| 437 | + gpu_args = self._build_gpu_args() |
| 438 | + gpu_env_args = self._build_gpu_env_args() |
345 | 439 | cmds = [ |
346 | 440 | "docker", |
347 | 441 | "run", |
348 | 442 | "--entrypoint", |
349 | 443 | "", |
350 | 444 | *env_arg, |
| 445 | + *gpu_env_args, |
351 | 446 | *rm_arg, |
352 | 447 | *volume_args, |
353 | 448 | *runtime_args, |
| 449 | + *gpu_args, |
354 | 450 | "-p", |
355 | 451 | f"{self._config.port}:{Port.PROXY}", |
356 | 452 | "-p", |
|
0 commit comments