Skip to content

Commit 7fb21a8

Browse files
committed
feat(sandbox): support starting sandbox from env_dir (Dockerfile build context)
Signed-off-by: Jiachen Zhang <zjc462490@alibaba-inc.com>
1 parent 02310d8 commit 7fb21a8

File tree

8 files changed

+263
-15
lines changed

8 files changed

+263
-15
lines changed

rock/admin/entrypoints/sandbox_api.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Annotated, Any
22

3+
import ray
34
from fastapi import APIRouter, Body, Depends, File, Form, UploadFile
45

56
from rock.actions import (
@@ -29,6 +30,27 @@
2930
from rock.deployments.config import DockerDeploymentConfig
3031
from rock.sandbox.sandbox_manager import SandboxManager
3132
from rock.utils import handle_exceptions
33+
from rock import env_vars
34+
35+
36+
def _sandbox_start_request_from_form(
    image: str = Form(""),
    memory: str = Form("8g"),
    cpus: float = Form(2),
    auto_clear_time_minutes: int = Form(env_vars.ROCK_DEFAULT_AUTO_CLEAR_TIME_MINUTES),
    startup_timeout: float = Form(300.0),
    pull: str = Form("missing"),
    sandbox_id: str | None = Form(None),
) -> SandboxStartRequest:
    """Assemble a ``SandboxStartRequest`` from individual form fields.

    Every parameter is declared as a ``Form`` field so the request can be
    submitted as form data alongside a file upload. A falsy ``sandbox_id``
    (for example an empty form value) is forwarded as ``None``.
    """
    request_fields = {
        "image": image,
        "memory": memory,
        "cpus": cpus,
        "auto_clear_time_minutes": auto_clear_time_minutes,
        "startup_timeout": startup_timeout,
        "pull": pull,
        # Collapse empty/falsy ids to None instead of passing them through.
        "sandbox_id": sandbox_id if sandbox_id else None,
    }
    return SandboxStartRequest(**request_fields)
3254

3355
sandbox_router = APIRouter()
3456
sandbox_manager: SandboxManager
@@ -73,6 +95,25 @@ async def start_async(
7395
return RockResponse(result=sandbox_start_response)
7496

7597

98+
@sandbox_router.post("/start_async_with_env")
@handle_exceptions(error_message="async start sandbox with env_dir failed")
async def start_async_with_env(
    request: Annotated[SandboxStartRequest, Depends(_sandbox_start_request_from_form)],
    headers: Annotated[StartHeaders, Depends()],
    env_dir_tar: UploadFile = File(...),
) -> RockResponse[SandboxStartResponse]:
    """Start a sandbox asynchronously from an uploaded build-context archive.

    The uploaded file is read fully into memory and handed to ``ray.put``;
    the value returned by ``ray.put`` is stored on the deployment config as
    ``env_dir_tar_ref`` before the start is delegated to ``sandbox_manager``.
    """
    deployment_config = DockerDeploymentConfig.from_request(request)
    # Attach a Ray reference to the archive rather than the raw bytes.
    deployment_config.env_dir_tar_ref = ray.put(await env_dir_tar.read())
    await _apply_kata_runtime_switch(deployment_config)
    start_response = await sandbox_manager.start_async(
        deployment_config,
        user_info=headers.user_info,
        cluster_info=headers.cluster_info,
    )
    return RockResponse(result=start_response)
115+
116+
76117
@sandbox_router.get("/is_alive")
77118
@handle_exceptions(error_message="get sandbox is alive failed")
78119
async def is_alive(sandbox_id: str):

rock/deployments/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"""
88

99
from abc import abstractmethod
10-
from typing import Literal
10+
from typing import Any, Literal
1111

1212
from pydantic import BaseModel, ConfigDict, Field, model_validator
1313

@@ -115,6 +115,9 @@ class DockerDeploymentConfig(DeploymentConfig):
115115
extended_params: dict[str, str] = Field(default_factory=dict)
116116
"""Generic extension field for storing custom string key-value pairs."""
117117

118+
env_dir_tar_ref: Any = None
119+
"""Optional Ray ObjectRef of tar.gz bytes (build context). When set, worker extracts and runs docker build before run."""
120+
118121
@model_validator(mode="before")
119122
def validate_platform_args(cls, data: dict) -> dict:
120123
"""Validate and extract platform arguments from docker_args.

rock/deployments/docker.py

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,17 @@
44
import random
55
import shlex
66
import subprocess
7+
import tarfile
8+
import tempfile
79
import time
810
import uuid
911
from concurrent.futures import ThreadPoolExecutor
12+
from io import BytesIO
1013
from pathlib import Path
1114
from typing import Any
1215

16+
import ray
17+
1318
from typing_extensions import Self
1419

1520
from rock import env_vars
@@ -20,7 +25,7 @@
2025
from rock.deployments.hooks.abstract import CombinedDeploymentHook, DeploymentHook
2126
from rock.deployments.runtime_env import DockerRuntimeEnv, LocalRuntimeEnv, PipRuntimeEnv, UvRuntimeEnv
2227
from rock.deployments.sandbox_validator import DockerSandboxValidator
23-
from rock.deployments.status import PersistedServiceStatus, ServiceStatus
28+
from rock.deployments.status import PhaseStatus, PersistedServiceStatus, ServiceStatus
2429
from rock.logger import init_logger
2530
from rock.rocklet import PACKAGE_NAME, REMOTE_EXECUTABLE_NAME
2631
from rock.rocklet.exceptions import DeploymentNotStartedError, DockerPullError
@@ -63,6 +68,8 @@ def __init__(
6368
self._check_stop_task = None
6469
self._container_name = None
6570
self._service_status = PersistedServiceStatus()
71+
if getattr(self._config, "env_dir_tar_ref", None) is not None:
72+
self._service_status.add_phase("docker_build", PhaseStatus())
6673
if self._config.container_name:
6774
self.set_container_name(self._config.container_name)
6875
if env_vars.ROCK_WORKER_ENV_TYPE == "docker":
@@ -285,6 +292,40 @@ def _memory(self):
285292
def _cpus(self):
286293
return [f"--cpus={self.config.cpus}"]
287294

295+
def _build_image_from_env_dir_tar(self, tar_bytes: bytes) -> str:
    """Extract an env_dir tar.gz build context and run ``docker build`` on it.

    The archive is unpacked into a temporary directory that is removed when
    the build finishes. The ``docker_build`` phase of the service status is
    moved to RUNNING at the start and to SUCCESS or FAILED on every exit path.

    Args:
        tar_bytes: Gzipped tar archive of the build context; a ``Dockerfile``
            must sit at the archive root.

    Returns:
        The image tag that was built (``self._config.image``).

    Raises:
        FileNotFoundError: The extracted context has no ``Dockerfile``.
        RuntimeError: ``docker build`` exited with a non-zero status.
        subprocess.TimeoutExpired: ``docker build`` exceeded the 1800 s limit.
        tarfile.TarError: The archive is invalid or contains unsafe members.
    """

    def _mark_failed(message: str) -> None:
        # Keep the phase from being left in RUNNING when the build aborts.
        self._service_status.update_status(phase_name="docker_build", status=Status.FAILED, message=message)

    self._service_status.update_status(
        phase_name="docker_build", status=Status.RUNNING, message="building from env_dir context"
    )
    with tempfile.TemporaryDirectory(prefix="rock_env_") as tmpdir:
        context_dir = Path(tmpdir)
        try:
            with tarfile.open(fileobj=BytesIO(tar_bytes), mode="r:gz") as tar:
                if hasattr(tarfile, "data_filter"):
                    # The archive is uploaded by a client: refuse absolute paths,
                    # ".." traversal, escaping links and device nodes.
                    tar.extractall(context_dir, filter="data")
                else:
                    # Older interpreters without extraction filters: validate by hand.
                    root = context_dir.resolve()
                    for member in tar.getmembers():
                        destination = (root / member.name).resolve()
                        candidates = [destination]
                        if member.issym():
                            candidates.append((destination.parent / member.linkname).resolve())
                        elif member.islnk():
                            candidates.append((root / member.linkname).resolve())
                        escapes = any(c != root and root not in c.parents for c in candidates)
                        if escapes or member.isdev():
                            raise tarfile.ExtractError(f"unsafe member in env_dir archive: {member.name}")
                    tar.extractall(context_dir)
        except (tarfile.TarError, OSError, EOFError) as e:
            _mark_failed(f"failed to extract env_dir context: {e}")
            raise

        dockerfile_path = context_dir / "Dockerfile"
        if not dockerfile_path.exists():
            msg = f"Dockerfile not found in env_dir context: {dockerfile_path}"
            _mark_failed(msg)
            raise FileNotFoundError(msg)

        build_cmd = [
            "docker",
            "build",
            "-f",
            str(dockerfile_path),
            "-t",
            self._config.image,
            str(context_dir),
        ]
        logger.info("Running docker build from env_dir context: %s", shlex.join(build_cmd))
        try:
            subprocess.run(build_cmd, check=True, capture_output=True, timeout=1800)
        except subprocess.CalledProcessError as e:
            # Build output is not guaranteed to be valid UTF-8.
            stderr_text = e.stderr.decode(errors="replace") if e.stderr else str(e)
            msg = f"docker build failed: {stderr_text}"
            _mark_failed(msg)
            raise RuntimeError(msg) from e
        except subprocess.TimeoutExpired as e:
            _mark_failed(f"docker build timed out after {e.timeout} seconds")
            raise
    self._service_status.update_status(
        phase_name="docker_build", status=Status.SUCCESS, message="docker build success"
    )
    return self._config.image
328+
288329
async def start(self):
289330
"""Starts the runtime."""
290331
if not self.sandbox_validator.check_availability():
@@ -295,11 +336,18 @@ async def start(self):
295336
self._service_status.set_sandbox_id(self._container_name)
296337
executor = get_executor()
297338
loop = asyncio.get_running_loop()
298-
await loop.run_in_executor(executor, self._pull_image)
299-
if self._config.python_standalone_dir is not None:
300-
image_id = self._build_image()
339+
340+
if self._config.env_dir_tar_ref is not None:
341+
tar_bytes = ray.get(self._config.env_dir_tar_ref)
342+
image_id = await loop.run_in_executor(
343+
executor, self._build_image_from_env_dir_tar, tar_bytes
344+
)
301345
else:
302-
image_id = self._config.image
346+
await loop.run_in_executor(executor, self._pull_image)
347+
if self._config.python_standalone_dir is not None:
348+
image_id = self._build_image()
349+
else:
350+
image_id = self._config.image
303351

304352
if not self.sandbox_validator.check_resource(image_id):
305353
raise Exception(f"Image {image_id} is not valid")

rock/sandbox/operator/ray.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ def _generate_actor_options(self, config: DockerDeploymentConfig) -> dict:
5252
async def submit(self, config: DockerDeploymentConfig, user_info: dict = {}) -> SandboxInfo:
5353
async with self._ray_service.get_ray_rwlock().read_lock():
5454
sandbox_id = config.container_name
55-
logger.info(f"[{sandbox_id}] start_async params:{json.dumps(config.model_dump(), indent=2)}")
55+
log_params = config.model_dump()
56+
# env_dir_tar_ref is a Ray ObjectRef, not JSON-serializable
57+
if "env_dir_tar_ref" in log_params and log_params["env_dir_tar_ref"] is not None:
58+
log_params = {k: v for k, v in log_params.items() if k != "env_dir_tar_ref"}
59+
log_params["env_dir_tar_ref"] = "<ObjectRef>"
60+
logger.info(f"[{sandbox_id}] start_async params:{json.dumps(log_params, indent=2)}")
5661
sandbox_actor: SandboxActor = await self.create_actor(config)
5762
sandbox_actor.start.remote()
5863
user_id = user_info.get("user_id", "default")

rock/sandbox/sandbox_manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ async def start_async(
116116
self.validate_sandbox_spec(self.rock_config.runtime, config)
117117
docker_deployment_config: DockerDeploymentConfig = await self.deployment_manager.init_config(config)
118118
sandbox_id = docker_deployment_config.container_name
119+
if getattr(docker_deployment_config, "env_dir_tar_ref", None) is not None and not (docker_deployment_config.image or "").strip():
120+
docker_deployment_config.image = f"rock_env_image:{sandbox_id}"
119121
sandbox_info: SandboxInfo = await self._operator.submit(docker_deployment_config, user_info)
120122
stop_time = str(int(time.time()) + docker_deployment_config.auto_clear_time * 60)
121123
auto_clear_time_dict = {

rock/sdk/sandbox/client.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import asyncio
2+
import io
23
import logging
34
import mimetypes
45
import os
6+
import tarfile
57
import time
68
import uuid
79
import warnings
@@ -160,22 +162,52 @@ async def _parse_error_message_from_status(self, status: dict):
160162
# If no failed stage is found, return None
161163
return None
162164

165+
@staticmethod
def _pack_env_dir_to_tar_gz(env_dir: Path) -> bytes:
    """Pack a docker build context directory into gzipped tar bytes.

    Entries are stored relative to the directory root (``arcname="."``), and
    the top-level ``.git`` entry is left out of the archive.

    Args:
        env_dir: Directory to pack; it must contain a ``Dockerfile``.

    Returns:
        The ``tar.gz`` archive as bytes.

    Raises:
        ValueError: ``env_dir`` is not a directory or has no ``Dockerfile``.
    """
    env_dir = Path(env_dir).resolve()
    if not env_dir.is_dir():
        raise ValueError(f"env_dir is not a directory: {env_dir}")
    dockerfile = env_dir / "Dockerfile"
    if not dockerfile.exists():
        raise ValueError(f"Dockerfile not found in env_dir: {dockerfile}")

    def _skip_top_level_git(ti):
        # With arcname="." member names look like "./.git", so the prefix has
        # to be stripped before comparing; a plain == ".git" never matches.
        name = ti.name[2:] if ti.name.startswith("./") else ti.name
        return None if name == ".git" else ti

    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
        tar.add(env_dir, arcname=".", filter=_skip_top_level_git)
    return buf.getvalue()
178+
163179
async def start(self):
164-
url = f"{self._url}/start_async"
165180
headers = self._build_headers()
166-
data = {
167-
"image": self.config.image,
168-
"image_os": self.config.image_os,
181+
form_data = {
169182
"auto_clear_time": self.config.auto_clear_seconds / 60,
170183
"auto_clear_time_minutes": self.config.auto_clear_seconds / 60,
171184
"startup_timeout": self.config.startup_timeout,
172185
"memory": self.config.memory,
173186
"cpus": self.config.cpus,
174187
}
175-
try:
176-
response = await HttpUtils.post(url, headers, data)
177-
except Exception as e:
178-
raise Exception(f"Failed to start standbox: {str(e)}, post url {url}")
188+
if not self.config.env_dir:
189+
form_data["image"] = self.config.image
190+
form_data["image_os"] = self.config.image_os
191+
if self.config.env_dir:
192+
# env_dir build: server uses rock_env_image:<sandbox_id> as image tag
193+
env_dir = Path(self.config.env_dir).resolve()
194+
tar_bytes = self._pack_env_dir_to_tar_gz(env_dir)
195+
url = f"{self._url}/start_async_with_env"
196+
try:
197+
response = await HttpUtils.post_multipart(
198+
url,
199+
headers,
200+
data=form_data,
201+
files={"env_dir_tar": ("env_dir.tar.gz", tar_bytes, "application/gzip")},
202+
)
203+
except Exception as e:
204+
raise Exception(f"Failed to start sandbox with env_dir: {str(e)}, post url {url}")
205+
else:
206+
url = f"{self._url}/start_async"
207+
try:
208+
response = await HttpUtils.post(url, headers, form_data)
209+
except Exception as e:
210+
raise Exception(f"Failed to start sandbox: {str(e)}, post url {url}")
179211

180212
logging.debug(f"Start sandbox response: {response}")
181213
if "Success" != response.get("status"):

rock/sdk/sandbox/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import warnings
2+
from pathlib import Path
3+
from typing import Union
24

35
from pydantic import BaseModel, Field, field_validator
46

@@ -39,6 +41,8 @@ class SandboxConfig(BaseConfig):
3941
experiment_id: str | None = None
4042
cluster: str = "zb"
4143
namespace: str | None = None
44+
env_dir: Union[str, Path, None] = None
45+
"""Optional path to docker build context directory (must contain Dockerfile). When set, SDK packs it as tar.gz and uploads; worker will docker build then run. Image tag is server-derived as rock_env_image:<sandbox_id>."""
4246

4347

4448
class SandboxGroupConfig(SandboxConfig):

0 commit comments

Comments
 (0)