Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions rock-conf/rock-local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ ray:
warmup:
images:
- "python:3.11"

runtime:
enable_gpu_passthrough: true
gpu_allocation_mode: "round_robin"
gpu_count_per_sandbox: 1
gpu_device_request: "all"
4 changes: 4 additions & 0 deletions rock/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ class RuntimeConfig:
use_standard_spec_only: bool = False
metrics_endpoint: str = ""
user_defined_tags: dict = field(default_factory=dict)
enable_gpu_passthrough: bool = False
gpu_device_request: str = "all"
gpu_allocation_mode: str = "fixed"
gpu_count_per_sandbox: int = 1

def __post_init__(self) -> None:
# Convert dict to StandardSpec if needed
Expand Down
96 changes: 96 additions & 0 deletions rock/deployments/docker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import datetime
import fcntl
import os
import random
import shlex
Expand Down Expand Up @@ -64,6 +65,7 @@ def __init__(
self._stop_time = datetime.datetime.now() + datetime.timedelta(minutes=self._config.auto_clear_time)
self._check_stop_task = None
self._container_name = None
self._resolved_gpu_spec: str | None = None
self._service_status = PersistedServiceStatus()
if self._config.container_name:
self.set_container_name(self._config.container_name)
Expand Down Expand Up @@ -169,6 +171,96 @@ def _build_runtime_args(self) -> list[str]:
]
return ["--privileged"]

def _detect_gpu_count(self) -> int:
    """Count the GPUs listed by ``nvidia-smi --list-gpus``.

    Every non-blank line of the command's output is counted as one GPU.

    Returns:
        The number of non-blank output lines, or ``0`` when running the
        command raises any exception (for example the binary is missing
        or it exits with a non-zero status).
    """
    list_cmd = ["nvidia-smi", "--list-gpus"]
    try:
        listing = subprocess.check_output(list_cmd, text=True, stderr=subprocess.DEVNULL)
    except Exception:
        # Best effort: any failure to query the tool is reported as "no GPUs".
        return 0
    return sum(1 for entry in listing.splitlines() if entry.strip())

def _resolve_round_robin_gpu_spec(self, gpu_count_per_sandbox: int) -> str | None:
"""Allocate device ids in round-robin across host GPUs."""
total_gpus = self._detect_gpu_count()
if total_gpus <= 0:
logger.warning("GPU round-robin requested but no GPUs detected on host")
return None

per_sandbox = max(1, min(int(gpu_count_per_sandbox), total_gpus))
counter_path = os.getenv("ROCK_GPU_COUNTER_PATH", "/tmp/rock_gpu_rr_counter")
os.makedirs(os.path.dirname(counter_path) or ".", exist_ok=True)

with open(counter_path, "a+", encoding="utf-8") as fp:
fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
try:
fp.seek(0)
raw = fp.read().strip()
counter = int(raw) if raw.isdigit() else 0
start = counter % total_gpus
next_counter = counter + per_sandbox
fp.seek(0)
fp.truncate()
fp.write(str(next_counter))
fp.flush()
finally:
fcntl.flock(fp.fileno(), fcntl.LOCK_UN)

device_ids = [(start + i) % total_gpus for i in range(per_sandbox)]
return "device=" + ",".join(str(i) for i in device_ids)

def _build_gpu_args(self) -> list[str]:
    """Build the ``--gpus`` docker arguments from runtime config and ROCK_* env vars.

    Side effect: ``self._resolved_gpu_spec`` is reset to ``None`` on entry and
    set to the chosen (unquoted) spec when GPU arguments are produced.

    Resolution rules:
        * If ``docker_args`` already carries ``--gpus`` / ``--gpus=...``,
          nothing is added.
        * Pass-through is enabled when the runtime config's
          ``enable_gpu_passthrough`` is truthy or ``ROCK_ENABLE_GPU_PASSTHROUGH``
          is one of ``1``/``true``/``yes``/``on`` (case-insensitive).
        * Mode comes from the runtime config's ``gpu_allocation_mode``; the
          ``ROCK_GPU_ALLOCATION_MODE`` env var is only consulted when that
          value is empty. Any mode other than ``round_robin`` is handled as
          fixed mode.
        * In round-robin mode a positive ``ROCK_GPU_COUNT_PER_SANDBOX``
          takes precedence over the runtime ``gpu_count_per_sandbox``.
        * In fixed mode the runtime ``gpu_device_request`` wins; the
          ``ROCK_GPU_DEVICE_REQUEST`` env var is only a fallback, then ``all``.

    Returns:
        ``["--gpus", <spec>]`` or an empty list when no GPU arguments apply.
        A ``device=`` spec listing several ids is wrapped in literal double
        quotes for the CLI.
    """
    self._resolved_gpu_spec = None
    # Respect a --gpus flag the caller already placed in docker_args.
    if any(arg == "--gpus" or arg.startswith("--gpus=") for arg in self._config.docker_args):
        return []

    runtime_config = self._config.runtime_config
    runtime_enabled = bool(getattr(runtime_config, "enable_gpu_passthrough", False))
    env_enabled = os.getenv("ROCK_ENABLE_GPU_PASSTHROUGH", "").strip().lower() in {"1", "true", "yes", "on"}
    if not (runtime_enabled or env_enabled):
        return []

    runtime_mode = str(getattr(runtime_config, "gpu_allocation_mode", "")).strip().lower()
    mode = runtime_mode or os.getenv("ROCK_GPU_ALLOCATION_MODE", "fixed").strip().lower() or "fixed"

    gpu_spec: str | None
    if mode == "round_robin":
        runtime_count = int(getattr(runtime_config, "gpu_count_per_sandbox", 1) or 1)
        env_count_raw = os.getenv("ROCK_GPU_COUNT_PER_SANDBOX", "").strip()
        env_count = int(env_count_raw) if env_count_raw.isdigit() else None
        # An env value of 0 is falsy and therefore falls back to the runtime count.
        per_sandbox = env_count or runtime_count
        gpu_spec = self._resolve_round_robin_gpu_spec(per_sandbox)
        if not gpu_spec:
            return []
        logger.info(f"GPU pass-through round-robin enabled: --gpus {gpu_spec}")
    else:
        runtime_gpu_spec = str(getattr(runtime_config, "gpu_device_request", "")).strip()
        gpu_spec = runtime_gpu_spec or (os.getenv("ROCK_GPU_DEVICE_REQUEST", "all").strip() or "all")
        logger.info(f"GPU pass-through fixed mode enabled: --gpus {gpu_spec}")

    # Keep the unquoted spec so the env-var builder can still parse "device=...".
    self._resolved_gpu_spec = gpu_spec

    # The docker CLI parses the --gpus value as comma-separated fields, so a
    # multi-id device list must be wrapped in literal double quotes to stay
    # one field (documented form: --gpus '"device=0,2"'). Specs that carry
    # further key=value fields, or are already quoted, are passed through.
    cli_spec = gpu_spec
    device_ids = gpu_spec[len("device="):] if gpu_spec.startswith("device=") else ""
    if "," in device_ids and "=" not in device_ids:
        cli_spec = f'"{gpu_spec}"'
    return ["--gpus", cli_spec]

def _build_gpu_env_args(self) -> list[str]:
    """Turn the resolved ``device=`` GPU spec into ``-e`` docker arguments.

    Reads ``self._resolved_gpu_spec`` (set by ``_build_gpu_args``).

    Returns:
        ``-e NVIDIA_VISIBLE_DEVICES=<ids> -e CUDA_VISIBLE_DEVICES=<ids>`` as
        a flat argument list when the spec is ``device=<ids>`` with a
        non-empty id list; otherwise an empty list (no spec, ``all``, or any
        other spec form).
    """
    spec = self._resolved_gpu_spec
    if not spec or spec == "all" or not spec.startswith("device="):
        return []

    _, _, devices = spec.partition("=")
    if not devices:
        return []

    # NOTE(review): these are the ids exactly as requested via --gpus. If the
    # container runtime renumbers GPUs from 0 inside the container, host
    # indices in CUDA_VISIBLE_DEVICES may hide devices from CUDA - confirm.
    return [
        "-e",
        f"NVIDIA_VISIBLE_DEVICES={devices}",
        "-e",
        f"CUDA_VISIBLE_DEVICES={devices}",
    ]

def _get_rocklet_start_cmd(self) -> list[str]:
cmd = self._runtime_env.get_rocklet_start_cmd()

Expand Down Expand Up @@ -342,15 +434,19 @@ async def start(self):

time.sleep(random.randint(0, 5))
runtime_args = self._build_runtime_args()
gpu_args = self._build_gpu_args()
gpu_env_args = self._build_gpu_env_args()
cmds = [
"docker",
"run",
"--entrypoint",
"",
*env_arg,
*gpu_env_args,
*rm_arg,
*volume_args,
*runtime_args,
*gpu_args,
"-p",
f"{self._config.port}:{Port.PROXY}",
"-p",
Expand Down
58 changes: 58 additions & 0 deletions tests/unit/rocklet/test_docker_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
Command,
CreateBashSessionRequest,
)
from rock.config import RuntimeConfig
from rock.deployments.config import DockerDeploymentConfig, get_deployment
from rock.deployments.docker import DockerDeployment


@pytest.mark.need_docker
async def test_docker_deployment(container_name):
Expand Down Expand Up @@ -63,3 +66,58 @@ def test_docker_deployment_config_platform():
config = DockerDeploymentConfig(platform="linux/amd64", docker_args=["--platform", "linux/amd64"])
with pytest.raises(ValueError):
config = DockerDeploymentConfig(platform="linux/amd64", docker_args=["--platform=linux/amd64"])


def test_build_gpu_args_disabled_by_default(monkeypatch):
    """With default runtime config and no env opt-in, no GPU args or env vars are built."""
    sandbox = DockerDeployment(runtime_config=RuntimeConfig())

    # Make sure an opt-in inherited from the caller's environment cannot leak in.
    monkeypatch.delenv("ROCK_ENABLE_GPU_PASSTHROUGH", raising=False)

    gpu_flags = sandbox._build_gpu_args()
    assert gpu_flags == []
    gpu_env = sandbox._build_gpu_env_args()
    assert gpu_env == []


def test_build_gpu_args_fixed_mode_from_runtime():
    """Fixed mode forwards the configured device request to --gpus and the env vars."""
    fixed_runtime = RuntimeConfig(
        enable_gpu_passthrough=True,
        gpu_allocation_mode="fixed",
        gpu_device_request="device=2",
    )
    sandbox = DockerDeployment(runtime_config=fixed_runtime)

    gpu_flags = sandbox._build_gpu_args()
    assert gpu_flags == ["--gpus", "device=2"]

    # The env args are derived from the spec resolved by _build_gpu_args above.
    expected_env = ["-e", "NVIDIA_VISIBLE_DEVICES=2", "-e", "CUDA_VISIBLE_DEVICES=2"]
    assert sandbox._build_gpu_env_args() == expected_env


def test_build_gpu_args_skips_when_docker_args_already_set():
    """A --gpus flag already present in docker_args suppresses the generated one."""
    gpu_enabled_runtime = RuntimeConfig(enable_gpu_passthrough=True)
    sandbox = DockerDeployment(
        docker_args=["--gpus", "all"],
        runtime_config=gpu_enabled_runtime,
    )

    gpu_flags = sandbox._build_gpu_args()

    assert gpu_flags == []


def test_build_gpu_args_round_robin(monkeypatch, tmp_path):
    """Round-robin hands out consecutive GPU index pairs and wraps around.

    GPU detection is stubbed to report 4 GPUs and the counter file is placed
    under ``tmp_path`` so the test neither needs real GPUs nor touches the
    default counter location.
    """
    deployment = DockerDeployment(
        runtime_config=RuntimeConfig(
            enable_gpu_passthrough=True,
            gpu_allocation_mode="round_robin",
            gpu_count_per_sandbox=2,
        )
    )

    monkeypatch.setattr(deployment, "_detect_gpu_count", lambda: 4)
    monkeypatch.setenv("ROCK_GPU_COUNTER_PATH", str(tmp_path / "gpu_rr_counter"))

    first = deployment._resolve_round_robin_gpu_spec(2)
    second = deployment._resolve_round_robin_gpu_spec(2)
    # A third allocation exhausts the 4 stubbed GPUs and must wrap to the start.
    third = deployment._resolve_round_robin_gpu_spec(2)

    assert first == "device=0,1"
    assert second == "device=2,3"
    assert third == "device=0,1"
19 changes: 19 additions & 0 deletions tests/unit/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,25 @@ async def test_runtime_config():
assert runtime_config.max_allowed_spec.cpus == 16
assert runtime_config.standard_spec.memory == "8g"
assert runtime_config.standard_spec.cpus == 2
assert runtime_config.enable_gpu_passthrough is False
assert runtime_config.gpu_device_request == "all"
assert runtime_config.gpu_allocation_mode == "fixed"
assert runtime_config.gpu_count_per_sandbox == 1


@pytest.mark.asyncio
async def test_runtime_config_gpu_fields():
runtime_config = RuntimeConfig(
enable_gpu_passthrough=True,
gpu_device_request="device=1",
gpu_allocation_mode="round_robin",
gpu_count_per_sandbox=2,
)

assert runtime_config.enable_gpu_passthrough is True
assert runtime_config.gpu_device_request == "device=1"
assert runtime_config.gpu_allocation_mode == "round_robin"
assert runtime_config.gpu_count_per_sandbox == 2

config_full = {
"standard_spec": {
Expand Down
Loading